Example No. 1
def main(cm_host, user, password):
    api = ApiResource(cm_host, username=user, password=password)
    cm = api.get_cloudera_manager()
    cm.update_all_hosts_config(
        {"java_home": "/usr/java/jdk1.8.0_121-cloudera"})
    print("restarting CM service - this will take a minute or so")
    cm.get_service().restart().wait()
    print("restarting cluster - this will take 2-5 minutes")
    api.get_all_clusters()[0].restart(restart_only_stale_services=True,
                                      redeploy_client_configuration=True).wait()
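Neither this example nor the next shows how main is invoked; a minimal, hypothetical command-line entry point (the script name and argument order are assumptions, not part of the original) could be:

# Hypothetical entry point; script name and argument order are assumptions.
import sys

if __name__ == '__main__':
    if len(sys.argv) != 4:
        sys.exit("usage: set_java_home.py <cm_host> <user> <password>")
    main(sys.argv[1], sys.argv[2], sys.argv[3])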
Example No. 2
def main(cm_host, user, password):
    api = ApiResource(cm_host, username=user, password=password)
    cm = api.get_cloudera_manager()
    config = cm.get_all_hosts_config(view='full')
    if config['java_home'].value == "/usr/java/jdk1.8.0_121-cloudera":
        print "Java home already set - skipping"
    else:
        print "Updating jdk location"
        cm.update_all_hosts_config(
            {"java_home": "/usr/java/jdk1.8.0_121-cloudera"})
        print("restarting CM service - this will take a minute or so")
        cm.get_service().restart().wait()
        print("restarting cluster - this will take 2-5 minutes")
        api.get_all_clusters()[0].restart(restart_only_stale_services=True,
                                          redeploy_client_configuration=True).wait()
Example No. 3
def do_call(host, port, user, password, cluster_name, service_role_name,
            random_index):
    api = ApiResource(host, port, user, password, False, MAN_API_VERSION)
    for cluster in api.get_all_clusters():
        if cluster_name is None:
            break
        elif cluster_name == cluster.name:
            break
    if cluster_name is not None and cluster_name != cluster.name:
        print >> sys.stderr, "Cloud not find cluster: " + cluster_name
        return -2
    do_print_header()
    for service in cluster.get_all_services():
        do_print_line_item(api, service, service_role_name, random_index,
                           'HDFS', 'NAMENODE', 'namenode_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'KUDU', 'KUDU_MASTER', 'webserver_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'HUE', 'HUE_SERVER', 'hue_http_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'HIVE', 'HIVESERVER2', 'hs2_thrift_address_port',
                           [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'IMPALA', 'IMPALAD', 'beeswax_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'FLUME', 'AGENT', 'agent_http_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'KAFKA', 'KAFKA_BROKER', 'port', [], [])
        do_print_line_item(api, service, service_role_name, random_index,
                           'ZOOKEEPER', 'SERVER', 'clientPort', [], [])
    do_print_footer()
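do_print_header, do_print_line_item and do_print_footer are defined elsewhere in that project; a rough, hypothetical sketch of the kind of lookup do_print_line_item performs (the helper name and output format below are assumptions) is:

# Hypothetical sketch: print the configured port for every role of a given type.
def print_port_for_role(api, service, service_type, role_type, config_name):
    if service.type != service_type:
        return
    for role in service.get_all_roles():
        if role.type != role_type:
            continue
        cfg = role.get_config(view='full')[config_name]
        port = cfg.value if cfg.value is not None else cfg.default
        host = api.get_host(role.hostRef.hostId)
        print("%s %s %s:%s" % (service_type, role_type, host.hostname, port))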
Example No. 4
def adjust_yarn_memory_limits(region, stack_name, restart=True):
    ec2_conn = create_ec2_connection(region)
    manager_instance = get_manager_instance(ec2_conn, stack_name)
    with cm_tunnel_ctx(manager_instance) as local_port:
        cm_api = ApiResource('localhost', username='******', password='******',
                             server_port=local_port, version=9)
        cluster = list(cm_api.get_all_clusters())[0]
        host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
        yarn = filter(lambda x: x.type == 'YARN',
                      list(cluster.get_all_services()))[0]
        rm_cg = filter(lambda x: x.roleType == 'RESOURCEMANAGER',
                       list(yarn.get_all_role_config_groups()))[0]
        nm_cg = filter(lambda x: x.roleType == 'NODEMANAGER',
                       list(yarn.get_all_role_config_groups()))[0]
        rm_cg.update_config({
            'yarn_scheduler_maximum_allocation_mb': (
                int(host.totalPhysMemBytes / 1024. / 1024.)),
            'yarn_scheduler_maximum_allocation_vcores': host.numCores})
        nm_cg.update_config({
            'yarn_nodemanager_resource_memory_mb': (
                int(host.totalPhysMemBytes / 1024. / 1024.)),
            'yarn_nodemanager_resource_cpu_vcores': host.numCores})
        cluster.deploy_client_config().wait()
        if restart:
            cluster.restart().wait()
Example No. 5
def getActiveCMConfig(totalconfig):
    cmConfig = {}
    for cm in totalconfig['cmfqdn']:
        api = ApiResource(cm, totalconfig[cm]['port'], totalconfig[cm]['user'],
                          totalconfig[cm]['passwd'], totalconfig[cm]['tls'],
                          totalconfig[cm]['apiv'])
        clusters = api.get_all_clusters()
        cmConfig[cm] = {}
        for cluster in clusters:
            cmConfig[cm][cluster.displayName] = {}
            services = cluster.get_all_services()
            for service in services:
                cmConfig[cm][cluster.displayName][service.name] = {}
                cmConfig[cm][cluster.displayName][service.name]['Service'] = {}
                for name, config in service.get_config(view='full')[0].items():
                    cmConfig[cm][cluster.displayName][
                        service.name]['Service'][name] = {
                            'value': config.value,
                            'default': config.default
                        }
                for roleGroup in service.get_all_role_config_groups():
                    cmConfig[cm][cluster.displayName][service.name][
                        roleGroup.roleType] = {}
                    for name, config in roleGroup.get_config(
                            view='full').items():
                        cmConfig[cm][cluster.displayName][service.name][
                            roleGroup.roleType][name] = {
                                'value': config.value,
                                'default': config.default
                            }
                    print(roleGroup.roleType)
    #print(json.dumps(cmConfig, indent=4))
    return cmConfig
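getActiveCMConfig expects a totalconfig dictionary keyed by CM hostname; a hypothetical input and call (the field names simply mirror the lookups above) might be:

# Hypothetical input matching the keys read inside getActiveCMConfig.
import json

totalconfig = {
    'cmfqdn': ['cm1.example.com'],
    'cm1.example.com': {'port': 7180, 'user': 'admin', 'passwd': 'admin',
                        'tls': False, 'apiv': 12},
}
cmConfig = getActiveCMConfig(totalconfig)
print(json.dumps(cmConfig, indent=4))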
Example No. 6
def get_cluster_specs():
    cm_api = ApiResource(os.environ['MANAGER_HOST'], username='******',
                         password='******', server_port=7180, version=9)
    host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
    cluster = list(cm_api.get_all_clusters())[0]
    yarn = filter(lambda x: x.type == 'YARN',
                  list(cluster.get_all_services()))[0]
    return {'num_worker_nodes': len(yarn.get_roles_by_type('NODEMANAGER')),
            'num_cores': host.numCores, 'node_memory': host.totalPhysMemBytes}
Example No. 7
def main():
    cmhost = os.environ['DEPLOYMENT_HOST_PORT'].split(":")[0]
    api = ApiResource(cmhost, username='******', password='******')
    all_clusters = api.get_all_clusters()
    for cluster in all_clusters:
        if (cluster.name == os.environ['CLUSTER_NAME']):
            break

    template = cluster.create_host_template("cdsw-gateway")
Example No. 8
def do_call(host, port, version, user, password, cluster_name, parcel_name, parcel_version, parcel_repo, init_pre_dir, init_post_dir):
    api = ApiResource(host, port, user, password, False, version)
    if not parcel_repo.endswith('/'):
        parcel_repo += '/'
    if re.match(REGEX_VERSION, parcel_version) is None or re.match(REGEX_VERSION, parcel_version).group() != parcel_version:
        raise Exception('Parcel [' + parcel_name + '] is qualified by invalid version [' + parcel_version + '] expected to match regular expression [' + REGEX_VERSION + ']')
    if not parcel_repo.endswith(parcel_version + '/'):
        raise Exception('Parcel [' + parcel_name + '] is qualified by invalid version [' + parcel_version + '] when compared with repository [' + parcel_repo + ']')    
    cm_config = api.get_cloudera_manager().get_config(view='full')
    repo_config = cm_config['REMOTE_PARCEL_REPO_URLS']
    repo_list = repo_config.value or repo_config.default
    if parcel_repo not in repo_list:     
        repo_list += ',' + parcel_repo
        api.get_cloudera_manager().update_config({'REMOTE_PARCEL_REPO_URLS': repo_list})
        time.sleep(POLL_SEC)  # The parcel synchronize end-point is not exposed via the API, so sleep instead
    cluster_names = []
    if cluster_name is None:
        for cluster in api.get_all_clusters():
            cluster_names.append(cluster.name)
    else:
        cluster_names.append(cluster_name)
    for cluster_name_itr in cluster_names:
        print 'Cluster [DEPLOYMENT] starting ... '
        cluster = api.get_cluster(cluster_name_itr)
        parcel = cluster.get_parcel(parcel_name, parcel_version)
        print 'Parcel [DEPLOYMENT] starting ... '
        do_parcel_op(cluster, parcel_name, parcel_version, 'DOWNLOAD', 'AVAILABLE_REMOTELY', 'DOWNLOADED', 'start_download')
        do_parcel_op(cluster, parcel_name, parcel_version, 'DISTRIBUTE', 'DOWNLOADED', 'DISTRIBUTED', 'start_distribution')
        do_parcel_op(cluster, parcel_name, parcel_version, 'ACTIVATE', 'DISTRIBUTED', 'ACTIVATED', 'activate')
        parcel = cluster.get_parcel(parcel_name, parcel_version)
        if parcel.stage != 'ACTIVATED':
            raise Exception('Parcel is currently mid-stage [' + parcel.stage + '], please wait for this to complete')
        print 'Parcel [DEPLOYMENT] finished'
        if init_pre_dir is not None and os.path.isdir(init_pre_dir):
            print 'Cluster [PRE_INIT] starting ... '
            for script in glob.glob(init_pre_dir + '/*.sh'):
                subprocess.call([script])
            print 'Cluster [PRE_INIT] finished'
        print 'Cluster [CONFIG_DEPLOYMENT] starting ... '
        cmd = cluster.deploy_client_config()
        if not cmd.wait(TIMEOUT_SEC).success:
            raise Exception('Failed to deploy client configs')
        print 'Cluster [CONFIG_DEPLOYMENT] finished'
        print 'Cluster [STOP] starting ... '
        cluster.stop().wait()
        print 'Cluster [STOP] finished'
        print 'Cluster [START] starting ... '
        cluster.start().wait()
        print 'Cluster [START] finished'
        if init_post_dir is not None and os.path.isdir(init_post_dir):
            print 'Cluster [POST_INIT] starting ... '
            for script in glob.glob(init_post_dir + '/*.sh'):
                subprocess.call([script])
            print 'Cluster [POST_INIT] finished'
        print 'Cluster [DEPLOYMENT] finished'
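do_parcel_op is not included in the snippet; a hedged sketch of the polling helper it implies, using only the parcel calls referenced above and the POLL_SEC/TIMEOUT_SEC constants, could be:

# Hypothetical implementation of the parcel polling helper used above.
def do_parcel_op(cluster, parcel_name, parcel_version, label,
                 start_stage, end_stage, op_name):
    parcel = cluster.get_parcel(parcel_name, parcel_version)
    if parcel.stage != start_stage:
        return
    getattr(parcel, op_name)()  # e.g. parcel.start_download()
    waited = 0
    while waited < TIMEOUT_SEC:
        parcel = cluster.get_parcel(parcel_name, parcel_version)
        if parcel.stage == end_stage:
            print('Parcel [' + label + '] reached ' + end_stage)
            return
        time.sleep(POLL_SEC)
        waited += POLL_SEC
    raise Exception('Parcel [' + label + '] timed out waiting for ' + end_stage)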
Example No. 9
    def getClusterInformation(self):
        api = ApiResource(self.cm_host, username=self.user, password=self.passwd)
        logger.info('Received; user -> %s, password -> %s, host -> %s', self.user, self.passwd, self.cm_host)
        for c in api.get_all_clusters():
            clusterInf = "Cluster name %s and version %s" % (c.name, c.version)
            #print "Cluster name %s and version %s" % (c.name, c.version)
            logger.info("Cluster name %s and version %s", c.name, c.version)
            if c.version == "CDH5":
                cdh5 = c
        return cdh5, clusterInf
Example No. 10
    def services(self):
        api = ApiResource(self.host,
                          username=self.username,
                          password=self.password)
        version = None
        service_list = []
        for cluster in api.get_all_clusters():
            if cluster.version == "CDH5":
                version = cluster
        for service in version.get_all_services():
            service_list.append(service.name)
        return service_list
Example No. 11
def get_cluster(cm_host, user, pwd, cluster_name):

    global api
    api = ApiResource(cm_host, username=user, password=pwd, version=12)
    for c in api.get_all_clusters():
        if cluster_name in c.name:
            # print("Cluster, Version : " + c.name + ", " + c.version)
            return True, c
    print("[ERR] : Cluster \"" + cluster_name + "\" not found at \"" +
          cm_host + "\"")
    return False, ""
Example No. 12
def find_impala_in_cm(cm_host, cm_user, cm_password, cm_cluster_name):
  """Finds the Impala service in CM and returns an Impala instance."""
  cm = ApiResource(cm_host, username=cm_user, password=cm_password)
  cm_impalas = [service for cluster in cm.get_all_clusters()
                if cm_cluster_name is None or cm_cluster_name == cluster.name
                for service in cluster.get_all_services() if service.type == "IMPALA"]
  if len(cm_impalas) > 1:
    raise Exception("Found %s Impala services in CM;" % len(cm_impalas) +
        " use --cm-cluster-name option to specify which one to use.")
  if len(cm_impalas) == 0:
    raise Exception("No Impala services found in CM")
  return Impala(cm_impalas[0])
Example No. 13
def main():
    s, a = arg_handle()
    for i in range(0, 15):
        while True:
            try:
                cm_host = "127.0.0.1"
                api = ApiResource(cm_host, username="******", password="******")
                cdh = api.get_all_clusters()[0]
            except:
                print "Failed to connect to Cloudera Manager."
                print "Attempting to connect to Cloudera Manager..."
                time.sleep(15)
                continue
            break
    srv = cdh.get_service(s)
    actions[a](srv, s)
Example No. 14
def update_cm(cm_host, cm_port, username, password):
    """Update config using the CM API (note: will restart service)"""
    elts = generate_xml_elements()
    cm_api = ApiResource(cm_host, username=username, password=password,
                         server_port=cm_port, version=9)
    cluster = list(cm_api.get_all_clusters())[0]
    hdfs = filter(lambda x: x.type == 'HDFS',
                  list(cluster.get_all_services()))[0]
    print("Updating HFDS core-site.xml safety valve...")
    _ = hdfs.update_config({
        'core_site_safety_valve': '\n'.join(tostring(e) for e in elts)})
    print("Deploying client config across the cluster...")
    cluster.deploy_client_config().wait()
    print("Restarting necessary services...")
    cluster.restart().wait()
    print("Done!")
Example No. 15
def main():
    s, a = arg_handle()
    for i in range(0, 15):
        while True:
            try:
                cm_host = "127.0.0.1"
                api = ApiResource(cm_host, username="******", password="******")
                cdh = api.get_all_clusters()[0]
            except:
                print "Failed to connect to Cloudera Manager."
                print "Attempting to connect to Cloudera Manager..."
                time.sleep(15)
                continue
            break
    srv = cdh.get_service(s)
    actions[a](srv, s)
Example No. 16
def main(cm_host, user, password):
    api = ApiResource(cm_host, username=user, password=password)
    cluster = api.get_all_clusters()[0]
    try:
        cluster.get_service(service_name)
        print "Service %s already configured. Skipping" % service_name
    except ApiException:
        print "creating new service %s" % service_name
        add_kudu_service(cluster, service_name)
        create_kudu_roles(cluster, api.get_all_hosts())
        update_kudu_role_group_configs(cluster)
        start_service(cluster, service_name)
        update_impala_service(cluster, service_name)
        print "Waiting for cluster to restart stale services"
        cluster.restart(restart_only_stale_services=True,
                        redeploy_client_configuration=True).wait()
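add_kudu_service and the other helpers live elsewhere in that script; as an illustration of the first step only, creating the service with cm_api could be as simple as the following sketch (role assignment and configuration are left out):

# Hypothetical sketch of the service-creation step only.
def add_kudu_service(cluster, service_name):
    return cluster.create_service(service_name, "KUDU")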
Example No. 17
class cm_utils(object):


    def __init__(self,service,role,host,list):
        self.service = service.lower()
        self.role = role.lower()
        self.host = host.lower()
        self.list = list.lower()
        cm_host = '10.7.177.234'
        self.api = ApiResource(cm_host, username="******", password="******")
        # "ALL" if service == "None" else  service
        # "ALL" if role == "None" else role
        # "ALL" if host == "None" else host


    def main(self):

#
        s_filter = None
        for c in self.api.get_all_clusters():
            print c
            for s in c.get_all_services():
                print "SERVICE : " + s.displayName + "==============="
#                if (s.displayName.lower() == self.service) or (self.service == "all"):
                if ( self.service in s.displayName.lower() ) or (self.service == "all"):
                    s_filter = s
                    for r in s_filter.get_all_roles():
#                        print "ROLE : " + r.type + "================"
                        if (self.role in r.type.lower()) or (self.role == "all"):
                            h = r.hostRef.hostId
                            hostname,ipAddress,healthSummary = self._get_host_info(h)
                            if (self.host in hostname) or (self.host in ipAddress) or (self.host in h) or (self.host == "all"):
                                if self.list == "yes":
                                    print ipAddress
                                else:
                                    print "[" + r.type + "]" + hostname + " " + ipAddress + " " + healthSummary




    def _get_host_info(self,hostid):
        host = self.api.get_host(hostid)
#        self.hostname = host.hostname
#        self.host_ip = host.ipAddress
#        self.host_status = host.healthSummary

        return host.hostname,host.ipAddress,host.healthSummary
Example No. 18
def main():
    """
    This is a script to export a current Cloudera Manager cluster configuration into a Hadrian-supported format.
    You can then use these configuration files as the basis for your new cluster configs.
    """

    parser = argparse.ArgumentParser(description='Export Cloudera Manager configs in a Hadrian-friendly format.')
    parser.add_argument('-H', '--host', '--hostname', action='store', dest='hostname', required=True, help='CM Server Name')
    parser.add_argument('-p', '--port', action='store', dest='port', type=int, default=7180, help='CM Port')
    parser.add_argument('-u', '--user', '--username', action='store', dest='username', required=True, help='CM username')
    args = parser.parse_args()

    password = getpass.getpass('Please enter your Cloudera Manager password: ')
    api = ApiResource(args.hostname, args.port, args.username, password, version=4)

    for cluster in api.get_all_clusters():
        conf_dir = './confs/' + cluster.name
        if not os.path.exists(conf_dir):
            os.makedirs(conf_dir)

            for service in cluster.get_all_services():
                with open(conf_dir + '/' + service.name + '.ini', 'w') as f:
                    print 'Dumping Service config for ' + service.name
                    rcg = list()

                    for i in service.get_all_role_config_groups():
                        rcg.append(i.name)
                    f.write('[' + service.type + ']\n')
                    f.write('config_groups=' + ','.join(rcg))
                    f.write('\n\n')
                    f.write('[' + service.name + '-svc-config]\n')

                    for item in service.get_config():
                        for k,v in item.iteritems():
                            f.write(k + '=' + str(v) + '\n')

                    for i in service.get_all_role_config_groups():
                        f.write('\n')
                        f.write('[' + i.name + ']\n')
                        for k,v in i.get_config('full').iteritems():
                            if v.value is not None:
                                f.write(k + '=' + str(v.value) + '\n')
                    f.close()
        else:
            print 'Cluster config dir already exists.  Please rename or remove existing config dir: ' + conf_dir
Example No. 19
def test():
    cm_host = 'xxx'
    api = ApiResource(cm_host,
                      username="******",
                      password="******",
                      use_tls=False,
                      version="12")

    # Get a list of all clusters
    cdh4 = None
    for c in api.get_all_clusters():
        print c.name
        if c.version == "CDH5":
            cdh4 = c


## -- Output --
# Cluster 1 - CDH4
# Cluster 2 - CDH3
Example No. 20
def migrate_services(cm_host, cm_username, cm_password, old_node, new_node):
    uid = str(uuid.uuid4().hex)
    api = ApiResource(cm_host, username=cm_username, password=cm_password)
    cluster = api.get_all_clusters()[0]
    migrate_hdfs(cluster, new_node, old_node, uid, api)
    migrate_hue(cluster, new_node, old_node, uid)
    migrate_impala(cluster, new_node, old_node, uid)
    migrate_spark(cluster, new_node, old_node, uid)
    migrate_spark2(cluster, new_node, old_node, uid)
    migrate_hive(cluster, new_node, old_node, uid)
    migrate_oozie(cluster, new_node, old_node, uid)
    migrate_zookeeper(cluster, new_node, old_node, uid)
    migrate_sentry(cluster, new_node, old_node, uid)
    migrate_solr(cluster, new_node, old_node, uid)
    migrate_yarn(cluster, new_node, old_node, uid)
    migrate_arcadia(cluster, new_node, old_node, uid)
    print('Restarting cluster, please wait.....')
    time.sleep(30)
    cluster.restart().wait()
    print('Migration of roles is complete')
Example No. 21
File: cma.py Project: nmarian85/aws
def run_cdh(action, cdh_st, logger):
    component = "cdh"
    cfg = ConfigParser.ConfigParser()
    cfg.read("/home/ec2-user/aws/cdh.cfg")

    cmhost = cfg.get("CM", "host")
    user = cfg.get("CM", "username")
    passw = cfg.get("CM", "password")
    cafile = cfg.get("CM", "cafile")

    context = ssl.create_default_context(cafile=cafile)

    api = ApiResource(cmhost,
                      username=user,
                      password=passw,
                      ssl_context=context,
                      use_tls=True)
    allc = api.get_all_clusters()
    c = allc[0]
    run_process(c, action, cdh_st, component, logger)
Example No. 22
def reset_cm(cm_host, cm_port, username, password):
    """Elim S3 config from CM API safety valve (service restart necessary)"""
    s3_props = set(get_s3_properties())
    cm_api = ApiResource(cm_host, username=username, password=password,
                         server_port=cm_port, version=9)
    cluster = list(cm_api.get_all_clusters())[0]
    hdfs = filter(lambda x: x.type == 'HDFS',
                  list(cluster.get_all_services()))[0]
    print("Getting current safety valve config")
    current_config = hdfs.get_config('full')[0]['core_site_safety_valve'].value
    # need the "<foo>...</foo>" wrapper to make it valid XML (a single root element is required)
    elts = list(fromstring('<foo>' + current_config + '</foo>'))
    new_elts = filter(lambda x: x.find('name').text not in s3_props, elts)
    print("Updating safety valve and deleting S3 config")
    _ = hdfs.update_config({
        'core_site_safety_valve': '\n'.join(tostring(e) for e in new_elts)})
    print("Deploying client config across the cluster...")
    cluster.deploy_client_config().wait()
    print("Restarting necessary services...")
    cluster.restart().wait()
    print("Done!")
Example No. 23
def do_call(host, port, user, password, cluster_name, service_role_name, random_index):
    api = ApiResource(host, port, user, password, False, MAN_API_VERSION)
    for cluster in api.get_all_clusters():
        if cluster_name is None:
            break
        elif cluster_name == cluster.name:
            break
    if cluster_name is not None and cluster_name != cluster.name:
        print >> sys.stderr, "Cloud not find cluster: " + cluster_name
        return -2;
    do_print_header()
    for service in cluster.get_all_services():
        do_print_line_item(api, service, service_role_name, random_index, 'HDFS', 'NAMENODE', 'namenode_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index, 'KUDU', 'KUDU_MASTER', 'webserver_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index, 'HUE', 'HUE_SERVER', 'hue_http_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index, 'HIVE', 'HIVESERVER2', 'hs2_thrift_address_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index, 'IMPALA', 'IMPALAD', 'beeswax_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index, 'FLUME', 'AGENT', 'agent_http_port', [], [])
        do_print_line_item(api, service, service_role_name, random_index, 'KAFKA', 'KAFKA_BROKER', 'port', [], [])
        do_print_line_item(api, service, service_role_name, random_index, 'ZOOKEEPER', 'SERVER', 'clientPort', [], [])
    do_print_footer()
Example No. 24
def main():
    API = ApiResource(CM_HOST,
                      version=16,
                      username=ADMIN_USER,
                      password=ADMIN_PASS)

    for c in API.get_all_clusters():
        if c.version == "CDH5":
            cdh5 = c

    for s in cdh5.get_all_services():
        restart_role(s, API)
    if (unhealthy_roles == []):
        print("ALL ROLES: OK")
    else:
        print("Following is the list of all unhealthy Roles:\n ")
        for role in unhealthy_roles:
            print("\n\t\t" + role)

    for s in cdh5.get_all_services():
        restart_service(s, API)
Example No. 25
def get_cluster_info(manager_host, server_port=7180, username='******',
                     password='******'):
    cm_api = ApiResource(manager_host, username=username, password=password,
                         server_port=server_port, version=9)
    host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
    cluster = list(cm_api.get_all_clusters())[0]
    yarn = filter(lambda x: x.type == 'YARN',
                  list(cluster.get_all_services()))[0]
    hive = filter(lambda x: x.type == 'HIVE',
                  list(cluster.get_all_services()))[0]
    impala = filter(lambda x: x.type == 'IMPALA',
                    list(cluster.get_all_services()))[0]
    hive_hs2 = hive.get_roles_by_type('HIVESERVER2')[0]
    hive_host = cm_api.get_host(hive_hs2.hostRef.hostId).hostname
    hive_port = int(
        hive_hs2.get_config('full')['hs2_thrift_address_port'].default)
    impala_hs2 = impala.get_roles_by_type('IMPALAD')[0]
    impala_host = cm_api.get_host(impala_hs2.hostRef.hostId).hostname
    impala_port = int(impala_hs2.get_config('full')['hs2_port'].default)
    return {'num_worker_nodes': len(yarn.get_roles_by_type('NODEMANAGER')),
            'node_cores': host.numCores, 'node_memory': host.totalPhysMemBytes,
            'hive_host': hive_host, 'hive_port': hive_port,
            'impala_host': impala_host, 'impala_port': impala_port}
Example No. 26
def adjust_yarn_memory_limits(region, stack_name):
    ec2_conn = create_ec2_connection(region)
    manager_instance = get_manager_instance(ec2_conn, stack_name)
    cm_api = ApiResource("localhost", username="******", password="******", server_port=64999, version=9)
    with http_tunnel_ctx(manager_instance, 7180, 64999):
        cluster = list(cm_api.get_all_clusters())[0]
        host = list(cm_api.get_all_hosts())[0]  # all hosts same instance type
        yarn = filter(lambda x: x.type == "YARN", list(cluster.get_all_services()))[0]
        rm_cg = filter(lambda x: x.roleType == "RESOURCEMANAGER", list(yarn.get_all_role_config_groups()))[0]
        nm_cg = filter(lambda x: x.roleType == "NODEMANAGER", list(yarn.get_all_role_config_groups()))[0]
        rm_cg.update_config(
            {
                "yarn_scheduler_maximum_allocation_mb": (int(host.totalPhysMemBytes / 1024.0 / 1024.0)),
                "yarn_scheduler_maximum_allocation_vcores": host.numCores,
            }
        )
        nm_cg.update_config(
            {
                "yarn_nodemanager_resource_memory_mb": (int(host.totalPhysMemBytes / 1024.0 / 1024.0)),
                "yarn_nodemanager_resource_cpu_vcores": host.numCores,
            }
        )
        cluster.deploy_client_config().wait()
        cluster.restart().wait()
Example No. 27
def main():
    configfile=''

    if len(sys.argv) != 3:
        print("Usage: %s -i configfile " % sys.argv[0])
        sys.exit(2)

    try:
        myopts, args = getopt.getopt(sys.argv[1:],"i:h")
    except getopt.GetoptError as e:
        print (str(e))
        print("Usage: %s -i configfile " % sys.argv[0])
        sys.exit(2)

    for o, a in myopts:
        if o == '-i':
            configfile=a
        elif o == '-h':
            print("Usage: %s -i configfile " % sys.argv[0])

    if os.path.isfile(configfile):
        print "processing configuration file...."
        pass
    else:
        print "file does not exist..."
        sys.exit(2)


    config = ConfigObj(configfile)
    cluster_name = config['cluster']['name']
    cdh_manager = config['cluster']['cdh_manager']
    cm_hostname = config['cluster']['cm_hostname']
    hostnames = config['cluster']['server_hostnames']
    services = config['cluster']['services']
    server_rack = config['cluster']['server_rack']
    server_login = config['cluster']['server_login']
    server_passwd = config['cluster']['server_passwd']
    server_key = config['cluster']['server_key']
    server_passphrase = config['cluster']['server_passphrase']
    cloudera_manager_repo = config['cluster']['cloudera_manager_repo']

    cm_host = cdh_manager
    api = ApiResource(cm_host, username="******", password="******")
    #print config['hive']['config']['hive_metastore_database_name']


    for c in api.get_all_clusters():
        if c.name == cluster_name:
            #cluster = c
            print "Cluster %s already exists " % (cluster_name)
            print "Please manually delete the cluster %s , all hosts and associated services." % (cluster_name)
            sys.exit(0)
        else:
            print "Starting the automation process..."

            pass


    cdhproc(cluster_name,api,hostnames,server_rack,server_login,server_passwd,server_key,server_passphrase,cloudera_manager_repo)
    createMGMT(api,cm_hostname,server_login,server_passwd,server_passphrase,server_key)
    deployHDFSMAP(cluster_name,api,configfile)

    if "yarn" in services:
        createYarn(cluster_name,api,configfile)
    if "zookeeper" in services:
        createZookeeper(cluster_name,api,configfile)
    if "hive" in services:
        createHive(cluster_name,api,configfile)
    if "hbase" in services:
        createHbase(cluster_name,api,configfile)
    if "spark" in services:
        createSpark(cluster_name,api,configfile)
    if "impala" in services:
        createImpala(cluster_name,api,configfile)

    cluster = api.get_cluster(cluster_name)

    print "Stopping cluster..."
    cmd = cluster.stop().wait()
    print "Active: %s. Success: %s" % (cmd.active, cmd.success)
    print "Starting cluster..."
    cmd = cluster.start().wait()
    print "Active: %s. Success: %s" % (cmd.active, cmd.success)

    if "solr" in services:
        createSolr(cluster_name,api,configfile)
    if "flume" in services:
        createFlume(cluster_name,api,configfile)
    if "oozie" in services:
        createOozie(cluster_name,api,configfile)
    if "sqoop" in services:
        createSqoop(cluster_name,api,configfile)
    if "hue" in services:
        createHue(cluster_name,api,configfile)

    #print "Stopping cluster..."
    #cmd = cluster.stop().wait()
    #print "Active: %s. Success: %s" % (cmd.active, cmd.success)
    #print "Starting cluster..."
    #cmd =cluster.start().wait
    #print "Active: %s. Success: %s" % (cmd.active, cmd.success)


    print "Cluster deployed successfully...."
    print "Login to: http://"+cdh_manager+":7180"
Example No. 28
    def runner(self, args, display=True):
        values = []
        health_values = []

        plugin_args = args.split() \
                    if args is not None and (len(args.strip()) > 0) \
                    else ""

        options = self.read_args(plugin_args)

        if options.hadoopdistro == 'CDH':
            api = ApiResource(server_host=options.cmhost, \
                            server_port=options.cmport, \
                            username=options.cmuser, \
                            password=options.cmpassword, \
                            version=11)
            cluster = api.get_cluster(api.get_all_clusters()[0].name)
            cdh = CDHData(api, cluster)
        else:
            cdh = HDPData(options.cmhost, options.cmuser, options.cmpassword)
        hbase = None

        def run_test_sequence():
            # pylint: disable=too-many-return-statements
            hbase = happybase.Connection(host=cdh.get_hbase_endpoint())
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()

                try:
                    hbase.create_table('blackbox_test_table', {'cf': dict()})
                    logging.debug("test table created")
                except AlreadyExists:
                    logging.debug("test table exists")

                table = hbase.table('blackbox_test_table')
                end = TIMESTAMP_MILLIS()
                create_table_ok = True
                create_table_ms = end - start
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.create_table_time_ms", [],
                          create_table_ms))
            except:
                LOGGER.error(traceback.format_exc())
                create_table_ok = False
                reason = ['Create HBase table operation failed']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.create_table_succeeded", reason,
                      create_table_ok))

            #write some data to it
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                table.put('row_key', {'cf:column': 'value'})
                end = TIMESTAMP_MILLIS()
                write_hbase_ok = True
                write_hbase_ms = end - start
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.write_time_ms", [], write_hbase_ms))
            except:
                LOGGER.error(traceback.format_exc())
                write_hbase_ok = False
                reason = ['Failed to insert row in HBase table']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.write_succeeded", reason, write_hbase_ok))

            #read some data from it
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                row = table.row('row_key', columns=['cf:column'])
                end = TIMESTAMP_MILLIS()
                read_hbase_ms = end - start
                read_hbase_ok = row['cf:column'] == 'value'
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.read_time_ms", [], read_hbase_ms))
            except:
                LOGGER.error(traceback.format_exc())
                hbase_fix_output = subprocess.check_output([
                    'sudo', '-u', 'hbase', 'hbase', 'hbck', '-repair',
                    'blackbox_test_table'
                ])
                for line in hbase_fix_output.splitlines():
                    if 'Status:' in line or 'inconsistencies detected' in line:
                        LOGGER.debug(line)
                subprocess.check_output([
                    'sudo', '-u', 'hbase', 'hbase', 'zkcli', 'rmr',
                    '/hbase/table/blackbox_test_table'
                ])
                subprocess.check_output([
                    'sudo', '-u', 'hdfs', 'hadoop', 'fs', '-rm', '-r', '-f',
                    '-skipTrash', '/hbase/data/default/blackbox_test_table'
                ])
                read_hbase_ok = False
                reason = ['Failed to fetch row by row key from HBase']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.read_succeeded", reason, read_hbase_ok))

            #create some hive metadata
            reason = []
            if abort_test_sequence is True:
                return
            try:
                start = TIMESTAMP_MILLIS()
                hive = hive_api.connect(cdh.get_hive_endpoint())
                end = TIMESTAMP_MILLIS()
                hive.cursor().execute("DROP TABLE blackbox_test_table")
                connect_to_hive_ms = end - start
                connect_to_hive_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.connection_time_ms", [],
                          connect_to_hive_ms))
            except:
                LOGGER.error(traceback.format_exc())
                connect_to_hive_ok = False
                reason = ['Failed to connect to Hive Metastore']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.connection_succeeded", reason,
                      connect_to_hive_ok))

            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                hive.cursor().execute((
                    "CREATE EXTERNAL TABLE "
                    "blackbox_test_table (key STRING, value STRING)"
                    "STORED BY \"org.apache.hadoop.hive.hbase.HBaseStorageHandler\" "
                    "WITH SERDEPROPERTIES "
                    "(\"hbase.columns.mapping\" = \":key,cf:column\") "
                    "TBLPROPERTIES(\"hbase.table.name\" = \"blackbox_test_table\")"
                ))
                end = TIMESTAMP_MILLIS()
                create_metadata_ms = end - start
                create_metadata_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.create_metadata_time_ms", [],
                          create_metadata_ms))
            except:
                LOGGER.error(traceback.format_exc())
                create_metadata_ok = False
                reason = [
                    'CREATE EXTERNAL TABLE statement failed on Hive Metastore'
                ]
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.create_metadata_succeeded", reason,
                      create_metadata_ok))

            #read some data via impala using it
            if abort_test_sequence is True:
                return

            if cdh.get_impala_endpoint() is not None:
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    impala = connect(host=cdh.get_impala_endpoint(),
                                     port=options.impalaport)
                    end = TIMESTAMP_MILLIS()
                    impala.cursor().execute("invalidate metadata")
                    connect_to_impala_ms = end - start
                    connect_to_impala_ok = True
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                              "hadoop.IMPALA.connection_time_ms", [],
                              connect_to_impala_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    connect_to_impala_ok = False
                    reason = ['Failed to connect to Impala']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                          "hadoop.IMPALA.connection_succeeded", reason,
                          connect_to_impala_ok))

                if abort_test_sequence is True:
                    return
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    impala_cursor = impala.cursor()
                    impala_cursor.execute("SELECT * FROM blackbox_test_table")
                    table_contents = impala_cursor.fetchall()
                    end = TIMESTAMP_MILLIS()
                    read_impala_ms = end - start
                    read_impala_ok = table_contents[0][1] == 'value'
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                              "hadoop.IMPALA.read_time_ms", [],
                              read_impala_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    read_impala_ok = False
                    reason = ['Failed to SELECT from Impala']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('IMPALA'),
                          "hadoop.IMPALA.read_succeeded", reason,
                          read_impala_ok))
            else:
                reason = []
                try:
                    start = TIMESTAMP_MILLIS()
                    hive_cursor = hive.cursor()
                    hive_cursor.execute("SELECT * FROM blackbox_test_table")
                    table_contents = hive_cursor.fetchall()
                    end = TIMESTAMP_MILLIS()
                    read_hive_ms = end - start
                    read_hive_ok = table_contents[0][1] == 'value'
                    values.append(
                        Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'),
                              "hadoop.HQUERY.read_time_ms", [], read_hive_ms))
                except:
                    LOGGER.error(traceback.format_exc())
                    read_hive_ok = False
                    reason = ['Failed to SELECT from Hive']
                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HQUERY'),
                          "hadoop.HQUERY.read_succeeded", reason,
                          read_hive_ok))

            #delete metadata
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                hive.cursor().execute("DROP TABLE blackbox_test_table")
                end = TIMESTAMP_MILLIS()
                drop_metadata_ms = end - start
                drop_metadata_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                          "hadoop.HIVE.drop_table_time_ms", [],
                          drop_metadata_ms))
            except:
                LOGGER.error(traceback.format_exc())
                drop_metadata_ok = False
                reason = ['Failed to DROP table in Hive Metastore']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HIVE'),
                      "hadoop.HIVE.drop_table_succeeded", reason,
                      drop_metadata_ok))

            #delete hbase table
            if abort_test_sequence is True:
                return
            reason = []
            try:
                start = TIMESTAMP_MILLIS()
                # Disabled deleting table to work around apparent hbase bug (see VPP-17) but leaving
                # test step in so it can be easily re-enabled for testing.
                #hbase.disable_table('blackbox_test_table')
                #hbase.delete_table('blackbox_test_table')
                end = TIMESTAMP_MILLIS()
                drop_table_ms = end - start
                drop_table_ok = True
                values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                          "hadoop.HBASE.drop_table_time_ms", [],
                          drop_table_ms))
            except:
                LOGGER.error(traceback.format_exc())
                drop_table_ok = False
                reason = ['Failed to drop table in HBase']
            health_values.append(
                Event(TIMESTAMP_MILLIS(), cdh.get_name('HBASE'),
                      "hadoop.HBASE.drop_table_succeeded", reason,
                      drop_table_ok))

        def to_status(flag):
            '''
            Convert True to OK and False to ERROR
            '''
            if flag in [True, False]:
                status = 'OK' if flag is True else 'ERROR'
            else:
                status = flag

            return status

        def default_health_value(name, service, operation, failed_step):
            result = False
            if len([event for event in health_values
                    if event.metric == name]) == 0:
                if failed_step is not None:
                    message = 'Did not attempt to %s due to timeout waiting for: %s' % (
                        operation, failed_step)
                else:
                    message = 'Timed out waiting for %s to complete' % operation

                health_values.append(
                    Event(TIMESTAMP_MILLIS(), cdh.get_name(service), name,
                          [message], False))
                result = True
            return result

        test_thread = threading.Thread(target=run_test_sequence)
        test_thread.daemon = True
        abort_test_sequence = False
        test_thread.start()
        test_thread.join(60.0)
        abort_test_sequence = True
        if hbase is not None:
            hbase.close()

        failed_step = None
        if default_health_value("hadoop.HBASE.create_table_succeeded", "HBASE",
                                "create HBase table",
                                failed_step) and failed_step is None:
            failed_step = "create HBase table"
        if default_health_value("hadoop.HBASE.write_succeeded", "HBASE",
                                "write to HBase",
                                failed_step) and failed_step is None:
            failed_step = "write to HBase"
        if default_health_value("hadoop.HBASE.read_succeeded", "HBASE",
                                "read from HBase",
                                failed_step) and failed_step is None:
            failed_step = "read from HBase"
        if default_health_value("hadoop.HIVE.connection_succeeded", "HIVE",
                                "connect to Hive Metastore",
                                failed_step) and failed_step is None:
            failed_step = "connect to Hive Metastore"
        if default_health_value("hadoop.HIVE.create_metadata_succeeded",
                                "HIVE", "create Hive Metastore table",
                                failed_step) and failed_step is None:
            failed_step = "create Hive Metastore table"
        if cdh.get_impala_endpoint() is not None:
            if default_health_value("hadoop.IMPALA.connection_succeeded",
                                    "IMPALA", "connect to Impala",
                                    failed_step) and failed_step is None:
                failed_step = "connect to Impala"
            if default_health_value("hadoop.IMPALA.read_succeeded", "IMPALA",
                                    "SELECT from Impala",
                                    failed_step) and failed_step is None:
                failed_step = "SELECT from Impala"
        else:
            if default_health_value("hadoop.HQUERY.read_succeeded", "HQUERY",
                                    "SELECT from Hive",
                                    failed_step) and failed_step is None:
                failed_step = "SELECT from Hive"
        if default_health_value("hadoop.HIVE.drop_table_succeeded", "HIVE",
                                "DROP table in Hive Metastore",
                                failed_step) and failed_step is None:
            failed_step = "DROP table in Hive Metastore"
        if default_health_value("hadoop.HBASE.drop_table_succeeded", "HBASE",
                                "drop table in HBase",
                                failed_step) and failed_step is None:
            failed_step = "drop table in HBase"

        cdh_status_indicators = cdh.get_status_indicators()
        health_values.extend(cdh_status_indicators)
        overall = {}
        for health_val in health_values:
            try:
                current = overall[health_val.source]
                current_val = to_status(current.value)
                current_causes = current.causes
            except KeyError:
                current_val = 'OK'
                current_causes = []

            update = to_status(health_val.value)

            # If current is ERROR, output is ERROR, regardless
            # If current is WARN, output is WARN if update is OK but ERROR if further WARN or ERROR
            # If update is OK, output is OK if OK, WARN if WARN and ERROR if ERROR

            out = 'ERROR'
            if current_val != "ERROR":
                if current_val == 'WARN':
                    if update == 'OK':
                        out = 'WARN'
                if current_val == 'OK':
                    out = update
            current_val = out
            current_causes.extend(health_val.causes)

            overall[health_val.source] = Event(
                health_val.timestamp, health_val.source,
                'hadoop.%s.health' % cdh.get_type(health_val.source),
                current_causes, current_val)

        values.extend(health_values)
        values.extend(overall.values())

        if display:
            self._do_display(values)

        return values
Example No. 29
def get_clusters():
    api = ApiResource(get_cm_host(), username=CM_USERNAME, password=CM_USER_PASSWORD, version=2)
    return api.get_all_clusters()
Example No. 30
CONFIG = ConfigParser.ConfigParser()
CONFIG.read('clouderaconfig.ini')
cm_host = CONFIG.get("CM", 'cm.host')
username = CONFIG.get("CM", 'admin.name')
password = CONFIG.get("CM", 'admin.password')
cluster_name = CONFIG.get("CM", 'cluster.name')
master_nodes = CONFIG.get("CDH", 'cluster.masternodes').split(',')
slave_nodes = CONFIG.get("CDH", 'cluster.slavenodes').split(',')
edge_nodes = CONFIG.get("CDH", 'cluster.edgenodes').split(',')

api = ApiResource(cm_host, username=username, password=password)

# Connect with the Cluster
CLUSTER = None
for cluster in api.get_all_clusters():
    #print c.name
    CLUSTER = cluster

#Download and activate Kafka parcel
PARCEL = None
PARCEL_PRODUCT = None
PARCEL_VERSION = None
for p in CLUSTER.get_all_parcels():
    #    print p
    #    print p.product
    #    print p.version
    if p.product == "KAFKA":
        PARCEL = p
        PARCEL_PRODUCT = p.product
        PARCEL_VERSION = p.version
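The snippet stops once the Kafka parcel is located; a sketch of the usual follow-up (download, distribute, activate, polling the stages named in the deployment example above) could look like this:

# Sketch of the typical follow-up, assuming PARCEL was found above.
import time

def wait_for_stage(cluster, product, version, stage, poll_sec=5):
    # Poll until the parcel reaches the requested stage.
    parcel = cluster.get_parcel(product, version)
    while parcel.stage != stage:
        time.sleep(poll_sec)
        parcel = cluster.get_parcel(product, version)

if PARCEL is not None:
    PARCEL.start_download()
    wait_for_stage(CLUSTER, PARCEL_PRODUCT, PARCEL_VERSION, 'DOWNLOADED')
    CLUSTER.get_parcel(PARCEL_PRODUCT, PARCEL_VERSION).start_distribution()
    wait_for_stage(CLUSTER, PARCEL_PRODUCT, PARCEL_VERSION, 'DISTRIBUTED')
    CLUSTER.get_parcel(PARCEL_PRODUCT, PARCEL_VERSION).activate()
    wait_for_stage(CLUSTER, PARCEL_PRODUCT, PARCEL_VERSION, 'ACTIVATED')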
Example No. 31

from cm_api.api_client import ApiResource

cloudera_user = '******'
cloudera_pass = '******'
cm_host = "localhost"
api = ApiResource(cm_host,
                  username=cloudera_user, password=cloudera_pass,  # nosec
                  version=17)

c = api.get_all_clusters()[0]
services = c.get_all_services()


def process_service(service):
    service_name = service.name
    if service_name == "spark_on_yarn":
        service_name = "spark"
    for role_cfgs in service.get_all_role_config_groups():
        role_cm_cfg = role_cfgs.get_config(view='full')
        role_cfg = parse_config(role_cm_cfg)
        role_name = role_cfgs.roleType.lower()
        write_cfg(role_cfg, '%s-%s.json' % (service_name, role_name))

    service_cm_cfg = service.get_config(view='full')[0]
    service_cfg = parse_config(service_cm_cfg)
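parse_config and write_cfg are not part of this excerpt; given the value/default fields used in the other examples, minimal stand-ins (hypothetical, not the original helpers) might be:

# Hypothetical stand-ins for the helpers referenced above.
import json

def parse_config(cm_cfg):
    # Keep the effective value: the explicit value if set, otherwise the default.
    return {name: (cfg.value if cfg.value is not None else cfg.default)
            for name, cfg in cm_cfg.items()}

def write_cfg(cfg, filename):
    with open(filename, 'w') as f:
        json.dump(cfg, f, indent=4)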
Example No. 32
def main():
    global ec2con
    global cwcon

    ec2con = boto.ec2.connect_to_region('us-east-1')
    cwcon = boto.ec2.cloudwatch.CloudWatchConnection()

    api = ApiResource(CM_HOST, username="******", password="******")

    displayName = None
    for c in api.get_all_clusters():
        displayName = c.displayName
        print "Cluster: %s (%s)" % (displayName, c.name)
    
    inst_cache = {}

    insts = api.get_all_hosts('full')
    print "Found %s in the cluster" % [inst.hostId for inst in insts.objects]
    for inst in insts.objects:
        clusterName =  inst.roleRefs[0].clusterName
        if clusterName != c.name:
            print 'Clusters do not correspond: %s vs %s' % (clusterName, c.name)
            continue

        cores = inst.numCores
        inst_id = inst.hostId
        inst_cache[inst_id] = my_cache =  {}
        # For later - we'll send in one data point for every TS query
        # that has AWS data
        my_cache['aws_info_recorded'] = False
        # my_cache['healthSummary'] = inst.healthSummary

        ress = ec2con.get_all_reservations(filters={'instance-id' : inst_id})
        if len(ress) > 0:
            print "Found %s reservations for %s: %s" % (len(ress), inst_id, ress)
        res = ress[0]

        instances = res.instances
        if len(instances) > 1:
            print "Found %s instances for %s %s" % (len(instances), inst_id, instances)
        inst = instances[0]
        if inst.id != inst_id:
            raise Exception("%s != %s" % (inst.id, inst_id))

        platform = inst.platform
        vpc_id = inst.vpc_id

        if platform == 'windows':
            product = 'Windows'
        elif not platform:
            product = 'Linux_UNIX'
        else:
            product = 'UNKNOWN'
        if vpc_id:
            product += "_Amazon_VPC"

        ami = inst.image_id

        my_cache['product'] = product
        my_cache['region'] = inst.region.name
        my_cache['zone'] = inst.placement
        inst_type = inst.instance_type.replace('.','_')

        my_cache['inst_type'] = inst_type
        
        time_f =  arrow.utcnow().replace(minutes=common.DEFAULT_LOOKBACK_MINUTES)
        time_t = arrow.utcnow()
        # TODO
        # http://arr.gr/blog/2013/08/monitoring-ec2-instance-memory-usage-with-cloudwatch/
        # http://blog.sciencelogic.com/netflix-steals-time-in-the-cloud-and-from-users/03/2011
        # https://www.stackdriver.com/cpu-steal-why-aws-cloudwatch-metrics-are-different-than-agent-metrics/
        stat = cwcon.get_metric_statistics(300,
                                           time_f,
                                           time_t,
                                           'CPUUtilization',
                                           'AWS/EC2',
                                           ['Average','Minimum','Maximum'],
                                           { 'InstanceId' : inst_id })     
            # [{u'Timestamp': datetime.datetime(2014, 4, 13, 6, 5), u'Average': 0.35250000000000004, u'Minimum': 0.33, u'Maximum': 0.42, u'Unit': u'Percent'}]
        print 'Fetching stats for %s: %s' % (inst_id, stat)
        if stat:
            for s in stat:
                ts = common.ts_from_aws(s)
                my_cache['avg_cpu'] = float(s['Average'])
                my_cache['ts'] = ts
        else:
            print "No stats found for %s" % inst_id
    print "Querying CDH."
    series = api.query_timeseries('SELECT * WHERE clusterName = %s'  % c.name)
    for entry in series.objects[0].timeSeries:
        # print entry.metadata.__dict__
        metric = entry.metadata.metricName
        # internal host
        hostname = ""
        if 'hostname' in entry.metadata.attributes:
            hostname = entry.metadata.attributes['hostname']
            
        inst_id = ""
        my_cache = {}

        if 'hostId' in entry.metadata.attributes:
            inst_id = entry.metadata.attributes['hostId']
            if inst_id not in inst_cache:
                print "Cannot find %s in %s" % (inst_id, inst_cache)
            my_cache = inst_cache[inst_id]
        service_name = ""
        if 'serviceName' in entry.metadata.attributes:
            service_name = entry.metadata.attributes['serviceName']
        service_type = ""
        if 'serviceType' in entry.metadata.attributes:
            service_type= entry.metadata.attributes['serviceType']
        role_type = ""
        if 'roleType' in entry.metadata.attributes:
            role_type = entry.metadata.attributes['roleType']

        
        num = entry.metadata.unitNumerators
        denom = entry.metadata.unitDenominators
        if len(num) > 1:
            print "Num: %s" % num
        if len(denom) > 1:
            print "Denom: %s" % denom
        unit = num[0]
           
        if len(denom) > 0:
            unit += denom[0]
        tags = {
            'cdh_service_name_service_type_role_type' : "%s.%s.%s" % (
                service_name,
                service_type,
                role_type),
            'unit' : unit
            }
        
        combined_tags = deepcopy(tags)
        if my_cache:
            # combined_tags['healthSummary']= my_cache['healthSummary']
            combined_tags['inst_type'] = my_cache['inst_type']
            combined_tags['cloud'] = 'aws'
            combined_tags['region'] = my_cache['region']
            combined_tags['zone'] = my_cache['zone']
            combined_tags['product'] = my_cache['product']
            
        if not entry.data:
            continue
        
        for sample in entry.data:
            ts = arrow.Arrow.fromdatetime(sample.timestamp).timestamp
            val = sample.value
            if len(combined_tags) > 8:
                print "ERROR: Too many tags: %s" % combined_tags
                sys.exit(1)
            common.otsdb_send(metric, val, combined_tags, ts, False)
            # Do the AWS once only
            if my_cache and not my_cache.get('aws_info_recorded'):
                # print my_cache
                combined_tags['unit'] = 'percent'
                if 'avg_cpu' in my_cache:
                    common.otsdb_send('aws_average_cpu_utilization',
                                      my_cache['avg_cpu'],
                                      combined_tags,
                                      my_cache['ts'],
                                      False)
                my_cache['aws_info_recorded'] = True
Ejemplo n.º 33
0
# In your virtual environment do:
# 	pip install cm-api
import sys
from cm_api.api_client import ApiResource

cm_host = "localhost"
api = ApiResource(cm_host, username="******", password="******")

print "*** CLUSTERS ***"

cluster = None
# List clusters
for c in api.get_all_clusters():
    print "Cluster \"%s\" is version %s" % (c.name, c.version)
    cluster = c

print "*** HOSTS ***"

for host_ref in cluster.list_hosts():
    host = api.get_host(host_ref.hostId)
    print host.hostname

print "*** SERVICES ***"

hdfs = None
# List services & health info
for s in cluster.get_all_services():
    print "Service \"%s\" -- state \"%s\" -- health \"%s\"" % (s.name, s.serviceState, s.healthSummary)
    # Get HDFS service
    if 'hdfs' in s.type.lower():
        hdfs = s
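
# Hypothetical continuation (not part of the original example): once the HDFS
# service has been found, its roles can be listed the same way the hosts were
# listed above, using only calls shown elsewhere in these examples
# (get_all_roles, get_host).
if hdfs is not None:
    print "*** HDFS ROLES ***"
    for role in hdfs.get_all_roles():
        role_host = api.get_host(role.hostRef.hostId)
        print "Role \"%s\" (%s) runs on %s" % (role.name, role.type, role_host.hostname)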
Ejemplo n.º 34
0
class handler_cm_api:
    def __init__(self):
        self._user_executing = grp.getgrnam(getpass.getuser())[0]

    def __getitem__(self):
        return self

    def setup(self,
              p_cm_host,
              p_cm_user,
              p_cm_pass,
              p_cm_version,
              p_cluster,
              p_cm_port=None,
              p_use_tls=False):
        self.cm_api = ApiResource(p_cm_host,
                                  server_port=p_cm_port,
                                  version=p_cm_version,
                                  username=p_cm_user,
                                  password=p_cm_pass,
                                  use_tls=p_use_tls)
        handler_cm_api.cluster_hosts = self.cm_api.get_all_hosts()
        if p_cluster:
            v_clusters = filter(lambda x: x.displayName == p_cluster,
                                self.cm_api.get_all_clusters())
            if not v_clusters:
                print("Error: That cluster is not valid.")
                return
            self.cluster = v_clusters[0]
            self.services = self.cluster.get_all_services()
            self.name = self.cluster.displayName

        tmp_topology = self.cluster.list_hosts()
        self.topology = {}

        for i in range(len(tmp_topology)):
            tmp_host = filter(lambda x: x.hostId == tmp_topology[i].hostId,
                              handler_cm_api.cluster_hosts)[0]
            self.topology[tmp_topology[i].hostId] = tmp_host.hostname

    def get_current_group(self):
        return self._user_executing

###############################
# For internal validations

    def __validate_service(self, p_service):
        v_service = filter(lambda x: x.type == p_service, self.services)

        if not v_service:
            print("Error: Service not found")
            raise SystemExit

        return v_service.pop()

    def __validate_hostname(self, p_hostname):
        v_node = filter(lambda x: x.hostname == p_hostname,
                        handler_cm_api.cluster_hosts)
        if not v_node:
            print("Error: Hostname not found")
            raise SystemExit

        return v_node.pop()

    def __validate_role(self, p_service, p_role, p_hostname):
        v_service = self.__validate_service(p_service)
        v_node = self.__validate_hostname(p_hostname)
        v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles())
        v_role = filter(lambda x: x.hostRef.hostId == v_node.hostId, v_roles)

        if not v_role:
            print("Error: Role not found in that host")
            raise SystemExit

        return v_role.pop()

######################################################################
# START/STOP/RESTART
######################################################################

    def stop_cluster(self):
        v_cmd = self.cluster.stop()
        v_msg = f_waiting_task(v_cmd)
        print(coloring(*v_msg))

    def start_cluster(self):
        v_cmd = self.cluster.start()
        v_msg = f_waiting_task(v_cmd)
        print(coloring(*v_msg))

    def restart_cluster(self):
        v_cmd = self.cluster.restart()
        v_msg = f_waiting_task(v_cmd)
        print(coloring(*v_msg))

    def rolling_restart_cluster(self):
        v_cmd = self.cluster.rolling_restart()
        v_msg = f_waiting_task(v_cmd)
        print(coloring(*v_msg))

######################################################################
#SERVICES
######################################################################
################
# Status
################
# ------ State

    def check_state_services(self):
        for v_srv in self.services:
            print(coloring(v_srv.serviceState, v_srv.type))

    def check_state_service(self, p_service):
        v_service = self.__validate_service(p_service)
        print(coloring(v_service.serviceState, v_service.type))

    def check_health_services(self):
        for v_srv in self.services:
            print(coloring(v_srv.healthSummary, v_srv.type))

# ----- Health

    def check_health_service(self, p_service):
        v_service = self.__validate_service(p_service)
        print(coloring(v_service.healthSummary, v_service.type))

#####################################
# stop/start/restart/Rolling Restart
#####################################

    def stop_service(self, p_service):
        v_service = self.__validate_service(p_service)
        print("* Stopping " + v_service.type)
        v_cmd = v_service.stop()
        v_msg = f_waiting_task(v_cmd)
        print(coloring(*v_msg))

    def start_service(self, p_service):
        v_service = self.__validate_service(p_service)
        print("* Starting " + v_service.type)
        v_cmd = v_service.start()
        v_msg = f_waiting_task(v_cmd)
        print(coloring(*v_msg))

    def restart_service(self, p_service):
        v_service = self.__validate_service(p_service)
        print("* Restarting " + v_service.type)
        v_cmd = v_service.restart()
        v_msg = f_waiting_task(v_cmd)
        print(coloring(*v_msg))

    def rolling_restart_service(self, p_service):
        v_service = self.__validate_service(p_service)
        try:
            print(" * Rolling Restarting " + v_service.type)
            v_cmd = v_service.rolling_restart()
            v_msg = f_waiting_task(v_cmd)
            print(coloring(*v_msg))
        except:
            if re.match("Command not valid for", str(sys.exc_info()[1])):
                print "It's not possible to use Rolling Restart in this service."
            else:
                raise

###################################################################
# ROLES
###################################################################
#################
# Status
#################

# ---- State

    def check_state_roles(self, p_service):
        v_service = self.__validate_service(p_service)
        print("*" + v_service.type + ":")
        for v_role in v_service.get_all_roles():
            print(
                coloring(
                    v_role.roleState,
                    filter(lambda x: x.hostId == v_role.hostRef.hostId,
                           handler_cm_api.cluster_hosts)[0].hostname) + ":\t" +
                v_role.type)

    def check_state_role(self, p_service, p_role):
        v_service = self.__validate_service(p_service)
        print("*" + v_service.type + ":")
        v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles())
        for v_role in v_roles:
            print(
                coloring(
                    v_role.roleState,
                    filter(lambda x: x.hostId == v_role.hostRef.hostId,
                           handler_cm_api.cluster_hosts)[0].hostname) + ":\t" +
                v_role.type)

    def check_state_all_roles(self):
        for v_service in self.services:
            self.check_state_roles(v_service.type)
            print('---------------------')

# ---- Health

    def check_health_roles(self, p_service):
        v_service = self.__validate_service(p_service)
        print("*" + v_service.type + ":")
        for v_role in v_service.get_all_roles():
            print(
                coloring(
                    v_role.healthSummary,
                    filter(lambda x: x.hostId == v_role.hostRef.hostId,
                           handler_cm_api.cluster_hosts)[0].hostname) + ":\t" +
                v_role.type)

    def check_health_role(self, p_service, p_role):
        v_service = self.__validate_service(p_service)
        print("*" + v_service.type + ":")
        v_roles = filter(lambda x: x.type == p_role, v_service.get_all_roles())
        for v_role in v_roles:
            print(
                coloring(
                    v_role.healthSummary,
                    filter(lambda x: x.hostId == v_role.hostRef.hostId,
                           handler_cm_api.cluster_hosts)[0].hostname) + ":\t" +
                v_role.type)

    def check_health_all_roles(self):
        for v_service in self.services:
            self.check_health_roles(v_service.type)
            print('---------------------')

#####################
# Stop/Start/Restart

    def stop_role(self, p_service, p_role, p_hostname):
        v_service = self.__validate_service(p_service)
        v_node = self.__validate_hostname(p_hostname)
        v_role = self.__validate_role(p_service, p_role, p_hostname)

        print("* Stopping " + v_role.type)
        v_cmd = v_service.stop_roles(v_role.name)
        v_msg = f_waiting_task(v_cmd[0])
        print(coloring(*v_msg))

    def start_role(self, p_service, p_role, p_hostname):
        v_service = self.__validate_service(p_service)
        v_node = self.__validate_hostname(p_hostname)
        v_role = self.__validate_role(p_service, p_role, p_hostname)

        print("* Starting " + v_role.type)
        v_cmd = v_service.start_roles(v_role.name)
        v_msg = f_waiting_task(v_cmd[0])
        print(coloring(*v_msg))

    def restart_role(self, p_service, p_role, p_hostname):
        v_service = self.__validate_service(p_service)
        v_node = self.__validate_hostname(p_hostname)
        v_role = self.__validate_role(p_service, p_role, p_hostname)

        print("* restarting " + v_role.type)
        v_cmd = v_service.restart_roles(v_role.name)
        v_msg = f_waiting_task(v_cmd[0])
        print(coloring(*v_msg))

###########################################################
#IMPALA QUERIES
###########################################################
# FILTERS
############################

    def setup_filters_impala_queries(self):
        v_start_time = raw_input(
            'Introduce the start time with following format: DD/MM/YYYY_hh:mm:ss. Example: 01/01/2018_00:00:00: '
        )
        if not re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$",
                        v_start_time):
            print("Error: Invalid Format for start time")
            return

        v_end_time = raw_input(
            'Introduce the end time with the following format: DD/MM/YYYY_hh:mm:ss. Example 31/01/2018_00:00:00: '
        )
        if not re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", v_end_time):
            print("Error: Invalid format for end time")
            return

        v_filter_type = raw_input(
            'Choose the kind of filter: user|duration|state: ')
        if not v_filter_type in ('user', 'duration', 'state'):
            print("Error: Invalid kind of filter")
            return

        if v_filter_type == 'user':
            v_filter_value = raw_input(
                'Introduce the user name you want to filter by: ')
            if not v_filter_value:
                print("Error: Invalid user name")
                return

        elif v_filter_type == 'duration':
            v_filter_value = raw_input(
                'Introduce the query duration you want to filter by: +Xs|-Xs|=Xs. Example: +0s: '
            )
            if not re.match("^[+-=]\d+.\d*[hms]$", v_filter_value):
                print("Error: Invalid duration filter.")
                return

        elif v_filter_type == 'state':
            v_filter_value = raw_input(
                'Introduce the query state you want to filter by: CREATED|INITIALIZED|COMPILED|RUNNING|FINISHED|EXCEPTION|UNKNOWN: '
            )
            if not v_filter_value in ('CREATED', 'INITIALIZED', 'COMPILED',
                                      'RUNNING', 'FINISHED', 'EXCEPTION',
                                      'UNKNOWN'):
                print("Error: Invalid state filter.")
                return

        v_limit = raw_input(
            "Introduce the max num of queries you want to check: ")
        if not re.match("^\d+$", v_limit):
            print("Error: Invalid limit. It has to be an integer")
            return

        return v_start_time, v_end_time, v_filter_type, v_filter_value, int(
            v_limit)

######################################
# Getting queries
######################################

    def get_impala_queries(self,
                           p_start_time=None,
                           p_end_time=None,
                           p_filter_type=None,
                           p_filter_value=None,
                           p_limit=None):
        if not (p_start_time and p_end_time and p_filter_type
                and p_filter_value and p_limit):
            p_start_time, p_end_time, p_filter_type, p_filter_value, p_limit = self.setup_filters_impala_queries(
            )

        v_impala = filter(lambda x: x.type == 'IMPALA', self.services)[0]

        if not v_impala:
            print("Error: Impala service doesnt exist in this cluster.")
            return

        if re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", p_start_time):
            v_start_time = datetime.strptime(p_start_time, '%d/%m/%Y_%H:%M:%S')
        else:
            print("Error. startTime format is not valid.")
            return

        if re.match("^\d{2}/\d{2}/20\d{2}_\d{2}:\d{2}:\d{2}$", p_end_time):
            v_end_time = datetime.strptime(p_end_time, '%d/%m/%Y_%H:%M:%S')
        else:
            print("Error. endTime format is not valid.")
            return

        if p_filter_type == "user" and type(p_filter_value) == str:
            v_filter_str = 'user = '******'+':
                v_filter_value = p_filter_value.replace('+', '>')
            if p_filter_value[0] == '-':
                v_filter_value = p_filter_value.replace('-', '<')
            v_filter_str = 'queryDuration ' + v_filter_value

        elif p_filter_type == "state" and p_filter_value in (
                'CREATED', 'INITIALIZED', 'COMPILED', 'RUNNING', 'FINISHED',
                'EXCEPTION', 'UNKNOWN'):
            v_filter_str = 'queryState = ' + p_filter_value

        else:
            print("Error: Filter is not valid.")
            return

        if type(p_limit) == int and 0 < p_limit <= 200:
            v_limit = p_limit
        else:
            print("Error: Limit is not valid. It must be > 0 and <= 200")
            return

        v_queries = v_impala.get_impala_queries(v_start_time, v_end_time,
                                                v_filter_str, v_limit).queries

        v_output = ''
        for vq in v_queries:
            v_coordinator = filter(lambda x: x.hostId == vq.coordinator.hostId,
                                   self.cluster_hosts)[0].hostname

            v_output += COLORS.BLUE + "##################################################################################" + COLORS.RESET + "\n"
            v_output += vq.queryId + " -- " + vq.queryState + ":\n"
            v_output += COLORS.RED + vq.statement + COLORS.RESET + "\n"
            v_output += COLORS.GREEN + "--- Attributes ---" + COLORS.RESET + "\n"
            v_output += "Query Type: " + vq.queryType + "\n"
            if 'query_status' in vq.attributes.keys():
                v_output += "Query Status: " + vq.attributes[
                    'query_status'] + "\n"

            v_output += "User: "******"\n"
            v_output += "Database: " + vq.database + "\n"
            if 'pool' in vq.attributes.keys():
                v_output += "Pool: " + vq.attributes['pool'] + "\n"

            v_output += "Starts at: " + vq.startTime.strftime(
                "%d/%m/%Y_%H:%M:%S") + "\n"
            v_output += "Ends at: " + vq.endTime.strftime(
                "%d/%m/%Y_%H:%M:%S") + "\n"
            v_output += "Coordinator: " + v_coordinator + "\n"
            v_output += "Rows Produced: " + str(vq.rowsProduced) + "\n"

            if 'file_formats' in vq.attributes.keys() and vq.attributes['file_formats']:
                v_output += "File Format: " + vq.attributes[
                    'file_formats'] + "\n"
            if 'hdfs_bytes_read' in vq.attributes.keys():
                v_output += "HDFS bytes read: " + vq.attributes[
                    'hdfs_bytes_read'] + "\n"
            if 'memory_aggregate_peak' in vq.attributes.keys():
                v_output += "Memory Aggregate Peak: " + vq.attributes[
                    'memory_aggregate_peak'] + "\n"
            if 'thread_cpu_time' in vq.attributes.keys():
                v_output += "Threads Cpu Time: " + vq.attributes[
                    'thread_cpu_time'] + "\n"

        print(v_output)
        print("Do you want to save the output? (Y/N)")
        v_save = raw_input("Your choice: ").upper()
        if v_save == 'Y':
            v_output_nc = re.sub("\\x1b\[\d+m", "", v_output)
            v_file = "/tmp/impala_queries_" + datetime.now().strftime(
                "%Y%m%d_%H%M%S") + ".log"
            with open(v_file, 'a') as file_output:
                file_output.write(v_output_nc)
            print("The output was written in: " + v_file)

######################
# Getting details
######################

    def get_details_impala_query(self, p_query_id=None):
        if not p_query_id:
            v_query_id = raw_input(
                'Introduce the query id you want to check the details: ')
        else:
            v_query_id = p_query_id

        v_impala = filter(lambda x: x.type == 'IMPALA', self.services)[0]
        v_queries = v_impala.get_impala_queries(
            datetime.now() - timedelta(days=30), datetime.now(),
            'queryDuration > 0s', 1000).queries

        v_query = filter(lambda x: x.queryId == v_query_id, v_queries)
        if not v_query:
            print(
                "Error: The query_id is not valid, was executed more than 30 days ago or is not between the last 1000 queries. 1000 is the limit."
            )
            return
        elif not v_query[0].detailsAvailable:
            print("Error: This Query does not have details available.")
            return
        else:
            v_output = "/tmp/impala_query_details_" + v_query[
                0].queryId + "_" + datetime.now().strftime(
                    "%Y%m%d_%H%M%S") + ".log"
            with open(v_output, 'a') as file_output:
                file_output.write(
                    str(v_impala.get_query_details(v_query[0].queryId)))
            print("The output was written in: " + v_output)

#######################

    def get_same_configuration(self):
        v_configs = []
        v_command = 'hadoop org.apache.hadoop.conf.Configuration'

        for v_node in self.topology.values():
            v_ssh = subprocess.Popen(
                ["ssh", v_node, "-o", "StrictHostKeyChecking=no", v_command],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
            v_configs += [v_ssh.stdout.readlines()]

        if len(self.topology) != len(v_configs):
            print(
                "Error: The num configs is different to the num of nodes in this cluster"
            )
            return

        if v_configs[1:] == v_configs[:-1]:
            print(coloring('GOOD', "The configs are the same in all nodes."))
            print("The nodes which were checked are: " +
                  ', '.join(self.topology.values()))

        else:
            print(coloring('BAD', "The configs are not the same."))
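
# A minimal usage sketch for the handler_cm_api class above (not part of the
# original snippet). The host, credentials, API version and cluster display
# name are placeholder values; setup() and the status helpers are the methods
# defined in the class, and the required imports (grp, getpass, re, sys,
# cm_api) plus the coloring/f_waiting_task helpers are assumed to be present.
if __name__ == '__main__':
    handler = handler_cm_api()
    handler.setup("cm-host.example.com", "admin", "admin", 19, "Cluster 1",
                  p_cm_port=7180)
    handler.check_state_services()
    handler.check_health_services()
    handler.check_state_all_roles()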
class RemoteDataLoad(object):
    """This is an implementation of the process to load a test-warehouse snapshot on
    a remote CM managed cluster. This script assumes that the warehouse snapshot was
    already downloaded and was either passed in as a parameter, or can be found by
    either inspecting the SNAPSHOT_DIR environment variable, or based on the WORKSPACE
    environment variable on a Jenkins build slave.

    The reason for the additional setup code is that in the local development
    environment it is assumed that $USER is HDFS superuser, which is not the case for
    remote deployments.
    """

    def __init__(self, cm_host, options):
        logger.info("Starting remote data load...")
        self.options = options
        self.cm_host = cm_host

        # Gateway host can be used if the CM host is not configured as a Hadoop gateway
        self.gateway = options.gateway if options.gateway else cm_host
        self.impala_home = os.environ["IMPALA_HOME"]
        self.api = ApiResource(self.cm_host, username=options.cm_user,
                               password=options.cm_pass)

        # The API returns a list of clusters managed by the CM host. We're assuming
        # that this CM host was set up for the purpose of Impala testing on one
        # cluster, so the list should only have one value.
        self.cluster = self.api.get_all_clusters()[0]
        self.services = self.get_services()

        self.config = self.get_service_client_configurations()
        logger.info("Retrieved service configuration")
        logger.info(str(self.config))
        self.prepare()
        logger.info("IMPALA_HOME: {0}".format(self.impala_home))

    def get_hostname_for_ref(self, host_ref):
        """Translate the HostRef instance into the hostname."""
        return self.api.get_host(host_ref.hostId).hostname

    @staticmethod
    def get_or_default(config):
        return config.value if config.value else config.default

    def get_services(self):
        """Confirm that all services are running, and return service dict."""
        services = dict((s.type, s) for s in self.cluster.get_all_services())

        if set(REQUIRED_SERVICES) != set(services.keys()):
            missing_services = set(REQUIRED_SERVICES) - set(services.keys())
            logger.error("Services not installed: {0}".format(list(missing_services)))
            raise RuntimeError("Cluster not ready.")

        if not all(services[s].serviceState == 'STARTED' for s in services):
            stopped = [s for s in services if services[s].serviceState != "STARTED"]
            logger.error("Not all services started: {0}".format(stopped))
            raise RuntimeError("Cluster not ready.")

        return services

    @timing
    def download_client_config(self, cluster, service):
        """Download the client configuration zip for a particular cluster and service.

        Since cm_api does not provide a way to download the archive, we build the URL
        manually and download the file. Once the file has been downloaded, the archive is
        extracted and its contents are copied to the Hadoop configuration directories
        defined by Impala.
        """
        logger.info("Downloading client configuration for {0}".format(service.name))
        url = "http://{0}:7180/api/{1}/clusters/{2}/services/{3}/clientConfig".format(
            self.cm_host, CM_API_VERSION, urlquote(cluster.name), urlquote(service.name))
        path = mkdtemp()
        sh.curl(url, o=os.path.join(path, "clientConfig.zip"), _out=tee, _err=tee)
        current = os.getcwd()
        os.chdir(path)
        sh.unzip("clientConfig.zip")
        for root, _, file_names in os.walk("."):
            for filename in fnmatch.filter(file_names, "*.xml"):
                src = os.path.join(root, filename)
                dst = os.path.join(self.impala_home, "fe", "src", "test", "resources")
                logger.debug("Copying {0} to {1}".format(src, dst))
                shutil.copy(src, dst)
        os.chdir(current)

    # TODO: this may be available in tests/comparison/cluster.py
    def set_hive_warehouse_dir(self, cluster, service):
        logger.info("Setting the Hive Warehouse Dir")
        for service in self.api.get_all_clusters()[0].get_all_services():
            logger.info(service)
            if service.type == "HIVE":
              hive_config = { "hive_warehouse_directory" : HIVE_WAREHOUSE_DIR }
              service.update_config(hive_config)

    # TODO: This functionality should be more generally available to other infrastructure
    # code, rather than being quarantined in this script. See IMPALA-4367.
    @timing
    def get_service_client_configurations(self):
        """Download the client configurations necessary to upload data to the remote
        cluster. Unfortunately, the CM API does not allow downloading it so we have to
        iterate over the services and download the config for all of them.

        In addition, it returns an options dictionary with settings required for data
        loading, such as the HS2 server, Impala hosts, NameNode, etc.

        Returns:
            A client-configuration dictionary, e.g.:

            {
                'hive_warehouse_directory': '/test-warehouse',
                'hs2': 'impala-test-cluster-1.gce.cloudera.com:10000',
                'impalad': ['impala-test-cluster-4.gce.cloudera.com:21000',
                            'impala-test-cluster-2.gce.cloudera.com:21000',
                            'impala-test-cluster-3.gce.cloudera.com:21000'],
                'metastore': 'impala-test-cluster-1.gce.cloudera.com:9083',
                'namenode': 'impala-test-cluster-1.gce.cloudera.com',
                'namenode_http': 'impala-test-cluster-1.gce.cloudera.com:20101',
                'kudu_master': 'impala-test-cluster-1.gce.cloudera.com'
            }
        """
        # Iterate over the services and find the information we need
        result = {}
        for service_type, service in self.services.iteritems():
            if service_type == "IMPALA":
                roles = service.get_roles_by_type("IMPALAD")
                impalads = []
                for r in roles:
                    rc_config = r.get_config("full")
                    hostname = self.get_hostname_for_ref(r.hostRef)
                    hs2_port = self.get_or_default(rc_config["beeswax_port"])
                    impalads.append("{0}:{1}".format(hostname, hs2_port))
                    result["impalad"] = impalads
            elif service_type == "HBASE":
                self.download_client_config(self.cluster, service)
            elif service_type == "HDFS":
                self.download_client_config(self.cluster, service)
                role = service.get_roles_by_type("NAMENODE")
                config = role[0].get_config("full")
                namenode = self.get_hostname_for_ref(role[0].hostRef)
                result["namenode"] = namenode
                result["namenode_http"] = "{0}:{1}".format(
                    namenode,
                    self.get_or_default(config["dfs_http_port"])
                )
            elif service_type == "HIVE":
                self.set_hive_warehouse_dir(self.cluster, service)
                self.download_client_config(self.cluster, service)
                hs2 = service.get_roles_by_type("HIVESERVER2")[0]
                rc_config = hs2.get_config("full")
                result["hive_warehouse_directory"] = self.get_or_default(
                    service.get_config("full")[0]["hive_warehouse_directory"])
                hostname = self.get_hostname_for_ref(hs2.hostRef)
                result["hs2"] = "{0}:{1}".format(hostname, self.get_or_default(
                    rc_config["hs2_thrift_address_port"]))

                # Get Metastore information
                ms = service.get_roles_by_type("HIVEMETASTORE")[0]
                rc_config = ms.get_config("full")
                result["metastore"] = "{0}:{1}".format(
                    self.get_hostname_for_ref(ms.hostRef),
                    self.get_or_default(rc_config["hive_metastore_port"])
                )
            elif service_type == "KUDU":
                # Service KUDU does not require a client configuration
                result["kudu_master"] = self.cm_host

        return result

    # TODO: This functionality should be more generally available to other infrastructure
    # code, rather than being quarantined in this script. See IMPALA-4367.
    @staticmethod
    def find_snapshot_file(snapshot_dir):
        """Given snapshot_directory, walks the directory tree until it finds a file
        matching the test-warehouse archive pattern."""
        for root, _, file_names in os.walk(snapshot_dir):
            for filename in fnmatch.filter(file_names, "test-warehouse-*-SNAPSHOT.tar.gz"):
                logger.info("Found Snapshot file {0}".format(filename))
                return os.path.join(root, filename)

    @timing
    def prepare(self):
        """Populate the environment of the process with the necessary values.

        In addition, it creates helper objects to run shell and SSH processes.
        """
        # Populate environment with required variables
        os.environ["HS2_HOST_PORT"] = self.config["hs2"]
        os.environ["HDFS_NN"] = self.config["namenode"]
        os.environ["IMPALAD"] = self.config["impalad"][0]
        os.environ["REMOTE_LOAD"] = "1"
        os.environ["HADOOP_USER_NAME"] = "hdfs"
        os.environ["TEST_WAREHOUSE_DIR"] = self.config["hive_warehouse_directory"]
        os.environ["KUDU_MASTER"] = self.config["kudu_master"]

        if self.options.snapshot_file is None:
            if "SNAPSHOT_DIR" in os.environ:
                snapshot_dir = os.environ["SNAPSHOT_DIR"]
            else:
                snapshot_dir = "{0}/testdata/test-warehouse-SNAPSHOT".format(
                    os.getenv("WORKSPACE"))
            if not os.path.isdir(snapshot_dir):
                err_msg = 'Snapshot directory "{0}" is not a valid directory'
                logger.error(err_msg.format(snapshot_dir))
                raise OSError("Could not find test-warehouse snapshot file.")

            logger.info("Snapshot directory: {0}".format(snapshot_dir))
            self.snapshot_file = self.find_snapshot_file(snapshot_dir)
        else:
            self.snapshot_file = self.options.snapshot_file

        # Prepare shortcuts for connecting to remote services
        self.gtw_ssh = ssh.bake("{0}@{1}".format(self.options.ssh_user, self.gateway),
                                "-oStrictHostKeyChecking=no",
                                "-oUserKnownHostsFile=/dev/null",
                                t=True, _out=tee, _err=tee)

        self.beeline = sh.beeline.bake(silent=False, outputformat="csv2", n="impala",
                                       u="jdbc:hive2://{0}/default".format(
                                           self.config["hs2"]))

        self.load_test_warehouse = sh.Command(
            "{0}/testdata/bin/load-test-warehouse-snapshot.sh".format(
                self.impala_home)).bake(
            _out=tee, _err=tee)

        self.create_load_data = sh.Command(
            "{0}/testdata/bin/create-load-data.sh".format(self.impala_home))

        self.main_impalad = self.config["impalad"][0]
        self.impala_shell = sh.Command("impala-shell.sh").bake(i=self.main_impalad,
                                                               _out=tee, _err=tee)

        self.python = sh.Command("impala-python").bake(u=True)
        self.compute_stats = sh.Command(
            "{0}/testdata/bin/compute-table-stats.sh".format(self.impala_home)).bake(
            _out=tee, _err=tee)

    @timing
    def load(self):
        """This method performs the actual data load. First it removes any known artifacts
        from the remote location. Next it drops any potentially existing databases from the
        Hive Metastore. Then it invokes the load-test-warehouse-snapshot.sh and
        create-load-data.sh scripts with the appropriate parameters. The most important
        parameters are implicitly passed to the scripts as environment variables pointing
        to the remote HDFS, Hive and Impala.
        """
        exploration_strategy = self.options.exploration_strategy

        logger.info("Removing other databases")
        dblist = self.beeline(e="show databases;", _err=tee).stdout
        database_list = dblist.split()[1:]  # The first element is the header string
        for db in database_list:
            if db.strip() != "default":
                logger.debug("Dropping database %s", db)
                self.impala_shell(q="drop database if exists {0} cascade;".format(db))

        logger.info("Invalidating metadata in Impala")
        self.impala_shell(q="invalidate metadata;")

        logger.info("Removing previous remote {0}".format(
            self.config["hive_warehouse_directory"]))
        r = sh.hdfs.dfs("-rm", "-r", "-f", "{0}".format(
            self.config["hive_warehouse_directory"]))

        logger.info("Expunging HDFS trash")
        r = sh.hdfs.dfs("-expunge")

        logger.info("Uploading test warehouse snapshot")
        self.load_test_warehouse(self.snapshot_file)

        # TODO: We need to confirm that if we change any permissions, that we don't
        # affect any running tests. See IMPALA-4375.
        logger.info("Changing warehouse ownership")
        r = sh.hdfs.dfs("-chown", "-R", "impala:hdfs", "{0}".format(
            self.config["hive_warehouse_directory"]))
        sh.hdfs.dfs("-chmod", "-R", "g+rwx", "{0}".format(
            self.config["hive_warehouse_directory"]))
        sh.hdfs.dfs("-chmod", "1777", "{0}".format(
            self.config["hive_warehouse_directory"]))

        logger.info("Calling create_load_data.sh")
        # The $USER variable is used in the create-load-data.sh script for beeline
        # impersonation.
        new_env = os.environ.copy()
        new_env["LOGNAME"] = "impala"
        new_env["USER"] = "******"
        new_env["USERNAME"] = "******"

        # Regardless of whether we are in fact skipping the snapshot load or not,
        # we nonetheless always pass -skip_snapshot_load to create-load-data.sh.
        # This is because we have already loaded the snapshot earlier in this
        # script, so we don't want create-load-data.sh to invoke
        # load-test-warehouse-snapshot.sh again.
        #
        # It would actually be nice to be able to skip the snapshot load, but
        # because of the existing messiness of create-load-data.sh, we can't.
        # This invocation...
        #
        #    $ create-load-data.sh -skip_snapshot_load -exploration_strategy core
        #
        # ...results in this error:
        #
        #    Creating /test-warehouse HDFS directory \
        #    (logging to create-test-warehouse-dir.log)... FAILED
        #    'hadoop fs -mkdir /test-warehouse' failed. Tail of log:
        #    Log for command 'hadoop fs -mkdir /test-warehouse'
        #    mkdir: `/test-warehouse': File exists
        #
        # Similarly, even though we might pass in "core" as the exploration strategy,
        # because we aren't loading a metadata snapshot (i.e., -skip_metadata_load is
        # false), an exhaustive dataload will always be done. This again is the result
        # of logic in create-load-data.sh, which itself ignores the value passed in
        # for -exploration_strategy.
        #
        # See IMPALA-4399: "create-load-data.sh has bitrotted to some extent, and needs
        #                   to be cleaned up"
        create_load_data_args = ["-skip_snapshot_load", "-cm_host", self.cm_host,
                                 "-snapshot_file", self.snapshot_file,
                                 "-exploration_strategy", exploration_strategy]

        self.create_load_data(*create_load_data_args, _env=new_env, _out=tee, _err=tee)

        sh.hdfs.dfs("-chown", "-R", "impala:hdfs", "{0}".format(
            self.config["hive_warehouse_directory"]))

        logger.info("Re-load HBase data")
        # Manually load the HBase data last.
        self.python("{0}/bin/load-data.py".format(self.impala_home),
                    "--hive_warehouse_dir={0}".format(
                        self.config["hive_warehouse_directory"]),
                    "--table_formats=hbase/none",
                    "--hive_hs2_hostport={0}".format(self.config["hs2"]),
                    "--hdfs_namenode={0}".format(self.config["namenode"]),
                    "--exploration_strategy={0}".format(exploration_strategy),
                    workloads="functional-query",
                    force=True,
                    impalad=self.main_impalad,
                    _env=new_env,
                    _out=tee,
                    _err=tee)

        self.compute_stats()
        logger.info("Load data finished")

    # TODO: Should this be refactored out of this script? It has nothing to do with
    # data loading per se. If tests rely on the environment on the client being set
    # a certain way -- as in the prepare() method -- we may need to find another way
    # to deal with that. See IMPALA-4376.
    @timing
    def test(self):
        """Execute Impala's end-to-end tests against a remote cluster. All configuration
        parameters are picked from the cluster configuration that was fetched via the
        CM API."""

        # TODO: Running tests via runtest.py is currently not working against a remote
        # cluster (although running directly via py.test seems to work.) This method
        # may be refactored out of this file under IMPALA-4376, so for the time being,
        # raise a NotImplementedError.
        raise NotImplementedError

        # Overwrite the username to match the service user on the remote system and deal
        # with the assumption that in the local development environment the current user
        # is HDFS superuser as well.
        new_env = os.environ.copy()
        new_env["LOGNAME"] = "impala"
        new_env["USER"] = "******"
        new_env["USERNAME"] = "******"

        strategy = self.options.exploration_strategy
        logger.info("Running tests with exploration strategy {0}".format(strategy))
        run_tests = sh.Command("{0}/tests/run-tests.py".format(self.impala_home))
        run_tests("--skip_local_tests",
                  "--exploration_strategy={0}".format(strategy),
                  "--workload_exploration_strategy=functional-query:{0}".format(strategy),
                  "--namenode_http_address={0}".format(self.config["namenode_http"]),
                  "--hive_server2={0}".format(self.config["hs2"]),
                  "--metastore_server={0}".format(self.config["metastore"]),
                  "query_test",
                  maxfail=10,
                  impalad=",".join(self.config["impalad"]),
                  _env=new_env,
                  _out=tee,
                  _err=tee)
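
# A minimal driver sketch for the RemoteDataLoad class above (not part of the
# original script). The option names mirror the attributes the class reads
# (cm_user, cm_pass, gateway, ssh_user, snapshot_file, exploration_strategy);
# the CM hostname and the defaults are placeholders.
if __name__ == "__main__":
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("--cm_host", default="cm-host.example.com")
    parser.add_option("--cm_user", default="admin")
    parser.add_option("--cm_pass", default="admin")
    parser.add_option("--gateway", default=None)
    parser.add_option("--ssh_user", default="jenkins")
    parser.add_option("--snapshot_file", default=None)
    parser.add_option("--exploration_strategy", default="core")
    options, _ = parser.parse_args()

    loader = RemoteDataLoad(options.cm_host, options)
    loader.load()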
Ejemplo n.º 36
0
        #config dir for Cloudera agent /etc/cloudera-scm-agent
        cloudera_agent_config = codecs.open(r"/etc/cloudera-scm-agent/config.ini", encoding="utf-8", mode="r")
        cloudera_manager_host = re.search('(?<=server_host=).*',cloudera_agent_config.read()).group(0)
        cloudera_agent_config.close()
    except IOError:
        print "not running on a Cloudera manager host"
        exit(1)


api = ApiResource(cloudera_manager_host, server_port=args.port, username=args.username,
                  password=args.password)

# the user-picked cluster, or the only cluster managed by Cloudera Manager
cluster = None
# Get a list of all clusters
clusters=api.get_all_clusters()

for c in clusters:
    for h in c.list_hosts():
        host = hosts.get_host(api, h.hostId)
        if host.hostname == node or host.ipAddress == node:
            cluster = c
if cluster:
    services = cluster.get_all_services()
else:
    print "Couldn't find node in any cluster"
    exit(1)
groups = None

if args.service:
    service = find_service(services, args.service)
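
# find_service() is defined elsewhere in the original script and is not shown
# in this fragment. A plausible sketch, assuming it matches a service by type
# or by name within the cluster's service list, could look like this
# (hypothetical helper, not the original implementation):
def find_service(services, wanted):
    """Return the first service whose type or name matches 'wanted', else None."""
    for service in services:
        if service.type.lower() == wanted.lower() or service.name == wanted:
            return service
    return None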
Ejemplo n.º 37
0
def main():
  module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS))

  api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=10)
  cluster_name = CLUSTER_NAME

  manager = api.get_cloudera_manager()

  action_a = module.params.get('action', None)

  if action_a == 'create_cluster':
    license_a = module.params.get('license', None)
    version_a = module.params.get('version', None)

    cluster_list = [x.name for x in api.get_all_clusters()]
    if cluster_name in cluster_list:
      module.exit_json(changed=False, msg='Cluster exists')
    else:
      cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a)
      if license_a == None:
        manager.begin_trial()
      else:
        manager.update_license(license_a.decode('base64'))
      module.exit_json(changed=True, msg='Cluster created')
  elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster','create_snapshot_policy']:
    # more complicated actions that need a created cluster go here
    cluster = api.get_cluster(cluster_name)
    host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts())

    # adds a host to the cluster
    # host_name should be in the internal DNS format, ip-xx-xx-xx.compute.internal
    if action_a == 'add_host':
      host_a = module.params.get('host', None)

      host_list = host_map.keys()
      if host_a in host_list:
        module.exit_json(changed=False, msg='Host already in cluster')
      else:
        try:
          cluster.add_hosts([host_a])
        except ApiException:
          # if a host isn't there, it could be because the agent didn't manage to connect yet
          # so let's wait a moment for it
          sleep(120)
          cluster.add_hosts([host_a])

        module.exit_json(changed=True, msg='Host added')

    # create management service and set its basic configuration
    # this needs a separate function since management is handled
    # differently than the rest of services
    elif action_a == 'create_mgmt':
      host_a = module.params.get('host', None)

      # getting the management service is the only way to check if mgmt exists
      # an exception means there isn't one
      try:
        mgmt = manager.get_service()
        module.exit_json(changed=False, msg='Mgmt service already exists')
      except ApiException:
        pass

      mgmt = manager.create_mgmt_service(ApiServiceSetupInfo())

      # this is ugly... and I see no good way to unuglify it
      firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n")
      reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n")

      # since there is no easy way of configuring the manager... let's do it here :(
      role_conf = defaultdict(dict)
      role_conf['ACTIVITYMONITOR'] = {
          'firehose_database_host': '{0}:7432'.format(host_a),
          'firehose_database_user': '******',
          'firehose_database_password': firehose_passwd,
          'firehose_database_type': 'postgresql',
          'firehose_database_name': 'amon',
          'firehose_heapsize': '268435456',
      }
      role_conf['EVENTSERVER'] = {
          'event_server_heapsize': '215964392'
      }
      role_conf['REPORTSMANAGER'] = {
          'headlamp_database_host': '{0}:7432'.format(host_a),
          'headlamp_database_user': '******',
          'headlamp_database_password': reports_passwd,
          'headlamp_database_type': 'postgresql',
          'headlamp_database_name': 'rman',
          'headlamp_heapsize': '215964392',
      }

      roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER']
      # create management roles
      for role in roles:
        mgmt.create_role('{0}-1'.format(role), role, host_map[host_a])

      # update configuration of each
      for group in mgmt.get_all_role_config_groups():
        group.update_config(role_conf[group.roleType])

      mgmt.start().wait()
      # after starting this service needs time to spin up
      sleep(30)
      module.exit_json(changed=True, msg='Mgmt created and started')

    # deploy a given parcel on all hosts in the cluster
    # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4
    elif action_a == 'deploy_parcel':
      name_a = module.params.get('name', None)
      version_a = module.params.get('version', None)

      if "latest" in version_a:
        available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a]
        if "-latest" in version_a:
          version_substr = match('(.+?)-latest', version_a).group(1)
        # if version is just "latest", try to check everything
        else:
          version_substr = ".*"
        try:
          [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None]
        except ValueError:
          module.fail_json(msg='Specified version {0} does not appear in {1} or appears twice'.format(version_substr, available_versions))
      else:
        version_parcel = version_a

      # we now go through various stages of getting the parcel
      # as there is no built-in way of waiting for an operation to complete
      # we use loops with sleep to get it done
      parcel = cluster.get_parcel(name_a, version_parcel)
      if parcel.stage == 'AVAILABLE_REMOTELY':
        parcel.start_download()

        while parcel.stage != 'DOWNLOADED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
          sleep(10)

      if parcel.stage == 'DOWNLOADED':
        parcel.start_distribution()

        while parcel.stage != 'DISTRIBUTED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
          # sleep while hosts report problems after the download
          for i in range(12):
            sleep(10)
            if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0:
              break

      # since parcels are distributed automatically when a new host is added to a cluster
      # we can encounter the 'ACTIVATING' stage then
      if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING':
        if parcel.stage == 'DISTRIBUTED':
          parcel.activate()

        while parcel.stage != 'ACTIVATED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          # this sleep has to be large because although the operation is very fast
          # it makes the management and cloudera hosts go bonkers, failing all of the health checks
          sleep(10)

        # sleep while hosts report problems after the distribution
        for i in range(60):
          sleep(10)
          if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0:
            break

        module.exit_json(changed=True, msg='Parcel activated')

      if parcel.stage == 'ACTIVATED':
        module.exit_json(changed=False, msg='Parcel already activated')

      # if we get down here, something is not right
      module.fail_json(msg='Invalid parcel state')

    # deploy nodes for workers, according to SERVICE_WORKER_MAP
    # also give them sane names and init zookeeper and kafka ones
    # which need id's specified
    elif action_a == 'deploy_service_worker_nodes':
      host_a = module.params.get('host', None)
      service_a = module.params.get('service', None)

      service_name = SERVICE_MAP[service_a]
      role_name = SERVICE_WORKER_MAP[service_a]['name']
      full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring']

      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
      else:
        service = cluster.get_service(service_name)

      nodes = [x for x in service.get_all_roles() if role_name in x.name]

      # if host already has the given group, we should skip it
      if host_map[host_a] in [x.hostRef.hostId for x in nodes]:
        module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name))
      # find out the highest id that currently exists
      else:
        node_names = [x.name for x in nodes]
        if len(node_names) == 0:
          # if no nodes, start numbering from 1
          node_i = 1
        else:
          # take the max number and add 1 to it
          node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1

        if service_name == 'ZOOKEEPER':
          role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a)
          # zookeeper needs a per-node ID in the configuration, so we set it now
          role.update_config({'serverId': node_i})
        elif service_name == 'KAFKA':
          role = service.create_role(full_role_name.format(node_i), role_name, host_a)
          # kafka needs a per-node ID in the configuration, so we set it now
          role.update_config({'broker.id': node_i})
        else:
          service.create_role(full_role_name.format(node_i), role_name, host_a)

        module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name))

    # deploy a service. just create it, don't do anything more
    # this is needed mainly when we have to set service properties before role deployment
    elif action_a == 'deploy_service':
      name_a = module.params.get('name', None)

      if not name_a in SERVICE_MAP:
        module.fail_json(msg='Unknown service: {0}'.format(name_a))
      service_name = SERVICE_MAP[name_a]
      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
        module.exit_json(changed=True, msg='{0} service created'.format(service_name))
      else:
        module.exit_json(changed=False, msg='{0} service already exists'.format(service_name))

    # deploy the base hdfs roles (the namenode and secondary)
    # this doesn't create the service, as at least one datanode should already be added!
    # the format also requires certain properties to be set before we run it
    elif action_a == 'deploy_hdfs_base':
      nn_host_a = module.params.get('nn_host', None)
      sn_host_a = module.params.get('sn_host', None)

      changed = False

      hdfs = cluster.get_service('HDFS')
      hdfs_roles = [x.name for x in hdfs.get_all_roles()]

      # don't create a secondary namenode when:
      #- there is one that already exists
      #- there is a second namenode, which means we have HA and don't need a secondary
      if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles:
        hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a)
        changed = True

      # create a namenode and format its FS
      # formatting the namenode requires at least one datanode and secondary namenode already in the cluster!
      if not 'HDFS-NAMENODE' in hdfs_roles:
        hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a)
        for command in hdfs.format_hdfs('HDFS-NAMENODE'):
          if command.wait().success == False:
            module.fail_json(msg='Failed formatting HDFS namenode with error: {0}'.format(command.resultMessage))
        changed = True

      module.exit_json(changed=changed, msg='Created HDFS service & NN roles')

    # enable HttpFS for HDFS
    # HUE requires this to support HA in HDFS
    elif action_a == 'deploy_hdfs_httpfs':
      host_a = module.params.get('host', None)
      
      hdfs = cluster.get_service('HDFS')
      hdfs_roles = [x.name for x in hdfs.get_all_roles()]
      
      # don't install second instance of HttpFS
      if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0:
        module.exit_json(changed=False, msg='HDFS HttpFS service already exists')
       
      hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) 
        
      module.exit_json(changed=True, msg='HDFS HttpFS service created')
      
    # enable HA for HDFS
    # this deletes the secondary namenode and creates a second namenode in its place
    # also, this spawns 3 journal node and 2 failover controller roles
    elif action_a == 'deploy_hdfs_ha':
      sn_host_a = module.params.get('sn_host', None)
      jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)]

      hdfs = cluster.get_service('HDFS')

      # if there's a second namenode, this means we already have HA enabled
      if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]:
        # this is bad and I should feel bad
        # jns is a list of dictionaries, each dict passes the required journalnode parameters
        jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': '/data0/hadoop/journal', 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)]

        # this call is so long because we set some predictable names for the services
        command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER',
                                    active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2')

        children = command.wait().children
        for command_children in children:
          # The format command is expected to fail, since we already formatted the namenode
          if command_children.name != 'Format' and command_children.success == False:
            module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage))
        module.exit_json(changed=True, msg='Enabled HA for HDFS service')
      else:
        module.exit_json(changed=False, msg='HDFS HA already enabled')
    # enable HA for YARN
    elif action_a == 'deploy_rm_ha':
      sn_host_a = module.params.get('sn_host', None)

      yarn = cluster.get_service('YARN')

      # if there are two roles matching to this name, this means HA for YARN is enabled
      if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1:
        command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER')
        children = command.wait().children
        for command_children in children:
          if command_children.success == False:
            module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage))
        module.exit_json(changed=True, msg='Enabled HA for YARN service')
      else:
        module.exit_json(changed=False, msg='YARN HA already enabled')

    # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP
    # after the deployment, run the init commands specified in SERVICE_INIT_COMMANDS
    elif action_a == 'deploy_base_roles':
      host_a = module.params.get('host', None)
      service_a = module.params.get('service', None)

      service_name = SERVICE_MAP[service_a]
      changed = False

      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
      else:
        service = cluster.get_service(service_name)

      service_roles = [x.name for x in service.get_all_roles()]

      # create each service from the map
      for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items():
        # check if the role already exists; the script can't compare names directly
        # because after enabling HA on YARN, role names contain random strings
        if len([0 for x in service_roles if match(role_name, x) != None]) == 0:
          service.create_role(role_name, cloudera_name, host_a)
          changed = True

          # init commands
          if role_name in SERVICE_INIT_COMMANDS.keys():
            for command_to_run in SERVICE_INIT_COMMANDS[role_name]:
              # different handling of commands specified by name and
              # ones specified by an instance method
              if ismethod(command_to_run):
                command = command_to_run(service)
              else:
                command = service.service_command_by_name(command_to_run)

              if command.wait().success == False:
                module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage))

      if changed == True:
        module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name))
      else:
        module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name))

    # set config values for a given service/role
    elif action_a == 'set_config':
      entity_a = module.params.get('entity', None)
      service_a = module.params.get('service', None)
      role_a = module.params.get('role', None)
      name_a = module.params.get('name', None)
      value_a = module.params.get('value', None)

      if not service_a in SERVICE_MAP:
        module.fail_json(msg='Unknown service: {0}'.format(service_a))

      # since management is handled differently, it needs a different service
      if service_a == 'management':
        service = manager.get_service()
      elif service_a == 'cm':
        service = manager
      else:
        service = cluster.get_service(SERVICE_MAP[service_a])

      # role and service configs are handled differently
      if entity_a == 'service':
        prev_config = service.get_config()
        curr_config = service.update_config({name_a: value_a})
        if service_a == 'cm':
          prev_config = [prev_config]
          curr_config = [curr_config]
        module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a]))

      elif entity_a == 'role':
        if not role_a in ROLE_MAP:
          module.fail_json(msg='Unknown role: {0}'.format(role_a))

        role = service.get_role_config_group(ROLE_MAP[role_a])
        prev_config = role.get_config()
        curr_config = role.update_config({name_a: value_a})
        module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: {1}'.format(name_a, curr_config[name_a]))

      else:
        module.fail_json(msg='Invalid entity, must be one of service, role')

    # handle service state
    # currently this can only start/restart a service
    elif action_a == 'service':
      state_a = module.params.get('state', None)
      service_a = module.params.get('service', None)

      try:
        if service_a == 'cm':
          service = manager.get_service()
        else:
          service = cluster.get_service(SERVICE_MAP[service_a])
      except ApiException:
        module.fail_json(msg='Service does not exist')

      # when starting a service, we also deploy the client config for it
      if state_a == 'started':
        if service.serviceState == 'STARTED':
          module.exit_json(changed=False, msg='Service already running')
        method = service.start
        verb = "start"
      elif state_a == 'restarted':
        method = service.restart
        verb = "restart"

      try:
        command = service.deploy_client_config()
        if command.wait().success == False:
          module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage))
      # since there is no way to check if a service handles client config deployments
      # we try our best and pass the exception if it doesn't
      except (ApiException, AttributeError):
        pass

      method().wait()
      # we need to wait for cloudera checks to complete...
      # otherwise it will report as failing
      sleep(10)
      for i in range(24):
        sleep(10)
        service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a])
        if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
          break
      service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a])
      if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
        module.exit_json(changed=True, msg='Service {0} successful'.format(verb))
      else:
        module.fail_json(msg='Service {0} failed'.format(verb))

    # handle cluster
    # currently this can only restart
    elif action_a == 'cluster':
      state_a = module.params.get('state', None)

      if state_a == 'restarted':
        command = cluster.restart(redeploy_client_configuration=True)
        if command.wait().success == False:
          module.fail_json(msg='Cluster restart failed with {0}'.format(command.resultMessage))
        else:
          module.exit_json(changed=True, msg='Cluster restart successful')

    # Snapshot policy
    # only create is supported
    elif action_a == 'create_snapshot_policy':
      name_a = module.params.get('name', None)
      value_a = module.params.get('value', None)
      service_a = module.params.get('service', None)
      service = cluster.get_service(SERVICE_MAP[service_a])
      payload=loads(value_a)
      # check if the policy already exists; an exception is expected when configuring it for the first time
      try: 
        test = service.get_snapshot_policy(name_a)
        module.exit_json(changed=False, msg='Defined policy already exists')
      except ApiException:
        pass
      try:
        command = service.create_snapshot_policy(payload)
        module.exit_json(changed=True, msg='Snapshot policy was created.')
      except (ApiException, AttributeError):
        module.fail_json(msg='ERROR in creating snapshot policy.')
Ejemplo n.º 38
0
def get_cluster():
    # connect to cloudera manager
    api = ApiResource(CM_HOST, username="******", password="******")
    # Take care of the case where cluster name has changed
    # Hopefully users wouldn't use this CM to deploy another cluster manually
    return (api, api.get_cluster(api.get_all_clusters()[0].name))
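A minimal usage sketch for the helper above (assuming CM_HOST and the masked credentials are defined earlier in the same script); it just lists the services of the cluster returned by get_cluster():

api, cluster = get_cluster()
for service in cluster.get_all_services():
    print service.name, service.type, service.serviceState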
Ejemplo n.º 39
0
Archivo: config.py Proyecto: xoltar/atk
        cloudera_manager_host = re.search(
            '(?<=server_host=).*', cloudera_agent_config.read()).group(0)
        cloudera_agent_config.close()
    except IOError:
        print "not running on a Cloudera manager host"
        exit(1)

api = ApiResource(cloudera_manager_host,
                  server_port=args.port,
                  username=args.username,
                  password=args.password)

# the cluster picked by the user, or the only cluster managed by Cloudera Manager
cluster = None
# Get a list of all clusters
clusters = api.get_all_clusters()

for c in clusters:
    for h in c.list_hosts():
        host = hosts.get_host(api, h.hostId)
        if host.hostname == node or host.ipAddress == node:
            cluster = c
if cluster:
    services = cluster.get_all_services()
else:
    print "Couldn't find node in any cluster"
    exit(1)
groups = None

if args.service:
    service = find_service(services, args.service)
Ejemplo n.º 40
0
def main():
  module = AnsibleModule(argument_spec=dict((argument, {'type': 'str'}) for argument in MODULE_ARGUMENTS))

  api = ApiResource('localhost', username=ADMIN_USER, password=ADMIN_PASS, version=9)
  cluster_name = CLUSTER_NAME

  manager = api.get_cloudera_manager()

  action_a = module.params.get('action', None)

  if action_a == 'create_cluster':
    license_a = module.params.get('license', None)
    version_a = module.params.get('version', None)

    cluster_list = [x.name for x in api.get_all_clusters()]
    if cluster_name in cluster_list:
      module.exit_json(changed=False, msg='Cluster exists')
    else:
      cluster = api.create_cluster(CLUSTER_NAME, fullVersion=version_a)
      if license_a == None:
        manager.begin_trial()
      else:
        manager.update_license(license_a.decode('base64'))
      module.exit_json(changed=True, msg='Cluster created')
  elif action_a in ['add_host', 'create_mgmt', 'deploy_parcel', 'deploy_hdfs_base', 'deploy_hdfs_httpfs', 'deploy_hdfs_dn', 'deploy_hdfs_ha', 'deploy_rm_ha', 'set_config', 'service', 'deploy_service', 'deploy_service_worker_nodes', 'deploy_base_roles', 'run_command', 'cluster', 'create_snapshot_policy', 'deploy_configuration']:
    # more complicated actions that need a created cluster go here
    cluster = api.get_cluster(cluster_name)
    host_map = dict((api.get_host(x.hostId).hostname, x.hostId) for x in cluster.list_hosts())

    # adds a host to the cluster
    # host_name should be in the internal DNS format, ip-xx-xx-xx.compute.internal
    if action_a == 'add_host':
      host_a = module.params.get('host', None)

      host_list = host_map.keys()
      if host_a in host_list:
        module.exit_json(changed=False, msg='Host already in cluster')
      else:
        try:
          cluster.add_hosts([host_a])
        except ApiException:
          # if a host isn't there, it could be because the agent didn't manage to connect yet
          # so let's wait a moment for it
          sleep(120)
          cluster.add_hosts([host_a])

        module.exit_json(changed=True, msg='Host added')

    # create management service and set its basic configuration
    # this needs a separate function since management is handled
    # differently than the rest of services
    elif action_a == 'create_mgmt':
      host_a = module.params.get('host', None)

      # getting the management service is the only way to check if mgmt exists
      # an exception means there isn't one
      try:
        mgmt = manager.get_service()
        module.exit_json(changed=False, msg='Mgmt service already exists')
      except ApiException:
        pass

      mgmt = manager.create_mgmt_service(ApiServiceSetupInfo())

      # this is ugly... and I see no good way to unuglify it
      firehose_passwd = Popen("sudo grep com.cloudera.cmf.ACTIVITYMONITOR.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n")
      reports_passwd = Popen("sudo grep com.cloudera.cmf.REPORTSMANAGER.db.password /etc/cloudera-scm-server/db.mgmt.properties | awk -F'=' '{print $2}'", shell=True, stdout=PIPE).stdout.read().rstrip("\n")

      # since there is no easy way of configuring the manager... let's do it here :(
      role_conf = defaultdict(dict)
      role_conf['ACTIVITYMONITOR'] = {
          'firehose_database_host': '{0}:7432'.format(host_a),
          'firehose_database_user': '******',
          'firehose_database_password': firehose_passwd,
          'firehose_database_type': 'postgresql',
          'firehose_database_name': 'amon',
          'firehose_heapsize': '268435456',
      }
      role_conf['EVENTSERVER'] = {
          'event_server_heapsize': '215964392'
      }
      role_conf['REPORTSMANAGER'] = {
          'headlamp_database_host': '{0}:7432'.format(host_a),
          'headlamp_database_user': '******',
          'headlamp_database_password': reports_passwd,
          'headlamp_database_type': 'postgresql',
          'headlamp_database_name': 'rman',
          'headlamp_heapsize': '268435456',
      }

      roles = ['ACTIVITYMONITOR', 'ALERTPUBLISHER', 'EVENTSERVER', 'HOSTMONITOR', 'SERVICEMONITOR', 'REPORTSMANAGER']
      # create management roles
      for role in roles:
        mgmt.create_role('{0}-1'.format(role), role, host_map[host_a])

      # update configuration of each
      for group in mgmt.get_all_role_config_groups():
        group.update_config(role_conf[group.roleType])

      mgmt.start().wait()
      # after starting, this service needs time to spin up
      sleep(30)
      module.exit_json(changed=True, msg='Mgmt created and started')

    # deploy a given parcel on all hosts in the cluster
    # you can specify a substring of the version ending with latest, for example 5.3-latest instead of 5.3.5-1.cdh5.3.5.p0.4
    elif action_a == 'deploy_parcel':
      name_a = module.params.get('name', None)
      version_a = module.params.get('version', None)

      if "latest" in version_a:
        available_versions = [x.version for x in cluster.get_all_parcels() if x.product == name_a]
        if "-latest" in version_a:
          version_substr = match('(.+?)-latest', version_a).group(1)
        # if version is just "latest", try to check everything
        else:
          version_substr = ".*"
        try:
          [version_parcel] = [x for x in available_versions if re.match(version_substr, x) != None]
        except ValueError:
          module.fail_json(msg="Specified version {0} doesn't appear in {1} or appears twice".format(version_substr, available_versions))
      else:
        version_parcel = version_a

      # we now go through various stages of getting the parcel
      # as there is no built-in way of waiting for an operation to complete
      # we use loops with sleep to get it done
      parcel = cluster.get_parcel(name_a, version_parcel)
      if parcel.stage == 'AVAILABLE_REMOTELY':
        parcel.start_download()

        while parcel.stage != 'DOWNLOADED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
          sleep(10)

      if parcel.stage == 'DOWNLOADED':
        parcel.start_distribution()

        while parcel.stage != 'DISTRIBUTED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
          # sleep while hosts report problems after the download
          for i in range(12):
            sleep(10)
            if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0:
              break

      # since parcels are distributed automatically when a new host is added to a cluster
      # we can encounter the 'ACTIVATING' stage then
      if parcel.stage == 'DISTRIBUTED' or parcel.stage == 'ACTIVATING':
        if parcel.stage == 'DISTRIBUTED':
          parcel.activate()

        while parcel.stage != 'ACTIVATED':
          parcel = cluster.get_parcel(name_a, version_parcel)
          # this sleep has to be large because although the operation is very fast
          # it makes the management and cloudera hosts go bonkers, failing all of the health checks
          sleep(10)

        # sleep while hosts report problems after the distribution
        for i in range(60):
          sleep(10)
          if sum([1 for x in api.get_all_hosts(view='Full') if x.healthSummary != 'GOOD']) == 0:
            break

        module.exit_json(changed=True, msg='Parcel activated')

      if parcel.stage == 'ACTIVATED':
        module.exit_json(changed=False, msg='Parcel already activated')

      # if we get down here, something is not right
      module.fail_json(msg='Invalid parcel state')

    # deploy nodes for workers, according to SERVICE_WORKER_MAP
    # also give them sane names and init zookeeper and kafka ones
    # which need IDs specified
    elif action_a == 'deploy_service_worker_nodes':
      host_a = module.params.get('host', None)
      service_a = module.params.get('service', None)

      service_name = SERVICE_MAP[service_a]
      role_name = SERVICE_WORKER_MAP[service_a]['name']
      full_role_name = SERVICE_WORKER_MAP[service_a]['formatstring']

      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
      else:
        service = cluster.get_service(service_name)

      nodes = [x for x in service.get_all_roles() if role_name in x.name]

      # if host already has the given group, we should skip it
      if host_map[host_a] in [x.hostRef.hostId for x in nodes]:
        module.exit_json(changed=False, msg='Host already is a {0}'.format(role_name))
      # find out the highest id that currently exists
      else:
        node_names = [x.name for x in nodes]
        if len(node_names) == 0:
          # if no nodes, start numbering from 1
          node_i = 1
        else:
          # take the max number and add 1 to it
          node_i = max([int(x.split('-')[-1]) for x in node_names]) + 1

        if service_name == 'ZOOKEEPER':
          role = service.create_role(full_role_name.format(node_i), 'SERVER', host_a)
          # zookeeper needs a per-node ID in the configuration, so we set it now
          role.update_config({'serverId': node_i})
        elif service_name == 'KAFKA':
          role = service.create_role(full_role_name.format(node_i), role_name, host_a)
          # kafka needs a per-node ID in the configuration, so we set it now
          role.update_config({'broker.id': node_i})
        else:
          service.create_role(full_role_name.format(node_i), role_name, host_a)

        module.exit_json(changed=True, msg='Added host to {0} role'.format(role_name))

    # deploy a service. just create it, don't do anything more
    # this is needed mainly when we have to set service properties before role deployment
    elif action_a == 'deploy_service':
      name_a = module.params.get('name', None)

      if not name_a in SERVICE_MAP:
        module.fail_json(msg='Unknown service: {0}'.format(name_a))
      service_name = SERVICE_MAP[name_a]
      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
        module.exit_json(changed=True, msg='{0} service created'.format(service_name))
      else:
        module.exit_json(changed=False, msg='{0} service already exists'.format(service_name))

    # deploy the base hdfs roles (the namenode and secondary)
    # this doesn't create the service, as at least one datanode should already be added!
    # the format also requires certain properties to be set before we run it
    elif action_a == 'deploy_hdfs_base':
      nn_host_a = module.params.get('nn_host', None)
      sn_host_a = module.params.get('sn_host', None)

      changed = False

      hdfs = cluster.get_service('HDFS')
      hdfs_roles = [x.name for x in hdfs.get_all_roles()]

      # don't create a secondary namenode when:
      #- there is one that already exists
      #- there is a second namenode, which means we have HA and don't need a secondary
      if not 'HDFS-SECONDARYNAMENODE' in hdfs_roles and not 'HDFS-NAMENODE-2' in hdfs_roles:
        hdfs.create_role('HDFS-SECONDARYNAMENODE', 'SECONDARYNAMENODE', sn_host_a)
        changed = True

      # create a namenode and format its FS
      # formatting the namenode requires at least one datanode and secondary namenode already in the cluster!
      if not 'HDFS-NAMENODE' in hdfs_roles:
        hdfs.create_role('HDFS-NAMENODE', 'NAMENODE', nn_host_a)
        for command in hdfs.format_hdfs('HDFS-NAMENODE'):
          if command.wait().success == False:
            module.fail_json(msg='Failed formatting HDFS namenode with error: {0}'.format(command.resultMessage))
        changed = True

      module.exit_json(changed=changed, msg='Created HDFS service & NN roles')

    # enable HttpFS for HDFS
    # HUE requires this to support HDFS HA
    elif action_a == 'deploy_hdfs_httpfs':
      host_a = module.params.get('host', None)
      
      hdfs = cluster.get_service('HDFS')
      hdfs_roles = [x.name for x in hdfs.get_all_roles()]
      
      # don't install a second instance of HttpFS
      if len([role for role in hdfs_roles if 'HDFS-HTTPFS' in role]) != 0:
        module.exit_json(changed=False, msg='HDFS HttpFS service already exists')
       
      hdfs.create_role('HDFS-HTTPFS-1', 'HTTPFS', host_map[host_a]) 
        
      module.exit_json(changed=True, msg='HDFS HttpFS service created')
      
    # enable HA for HDFS
    # this deletes the secondary namenode and creates a second namenode in its place
    # it also spawns three journalnode roles and two failover controller roles
    elif action_a == 'deploy_hdfs_ha':
      sn_host_a = module.params.get('sn_host', None)
      jn_dir_a = module.params.get('jn_dir', None)
      jn_names_a = [module.params.get('jn1_host', None), module.params.get('jn2_host', None), module.params.get('jn3_host', None)]

      hdfs = cluster.get_service('HDFS')

      # if there's a second namenode, this means we already have HA enabled
      if not 'HDFS-NAMENODE-2' in [x.name for x in hdfs.get_all_roles()]:
        # this is bad and I should feel bad
        # jns is a list of dictionaries, each dict passes the required journalnode parameters
        jns = [{'jnHostId': host_map[jn_name], 'jnEditsDir': jn_dir_a, 'jnName': 'HDFS-JOURNALNODE-{0}'.format(i + 1)} for i, jn_name in enumerate(jn_names_a)]

        # this call is so long because we set some predictable names for the services
        command = hdfs.enable_nn_ha('HDFS-NAMENODE', host_map[sn_host_a], 'nameservice1', jns, zk_service_name='ZOOKEEPER',
                                    active_fc_name='HDFS-FAILOVERCONTROLLER-1', standby_fc_name='HDFS-FAILOVERCONTROLLER-2', standby_name='HDFS-NAMENODE-2')

        children = command.wait().children
        for command_children in children:
          # The Format command is expected to fail, since we already formatted the namenode
          if command_children.name != 'Format' and command_children.success == False:
            module.fail_json(msg='Command {0} failed when enabling HDFS HA with error {1}'.format(command_children.name, command_children.resultMessage))
        module.exit_json(changed=True, msg='Enabled HA for HDFS service')
      else:
        module.exit_json(changed=False, msg='HDFS HA already enabled')
    # enable HA for YARN
    elif action_a == 'deploy_rm_ha':
      sn_host_a = module.params.get('sn_host', None)

      yarn = cluster.get_service('YARN')

      # if there are two ResourceManager roles, HA for YARN is already enabled; with only one we still need to enable it
      if len([0 for x in yarn.get_all_roles() if match('^YARN-RESOURCEMANAGER.*$', x.name) != None]) == 1:
        command = yarn.enable_rm_ha(sn_host_a, zk_service_name='ZOOKEEPER')
        children = command.wait().children
        for command_children in children:
          if command_children.success == False:
            module.fail_json(msg='Command {0} failed when enabling YARN HA with error {1}'.format(command_children.name, command_children.resultMessage))
        module.exit_json(changed=True, msg='Enabled HA for YARN service')
      else:
        module.exit_json(changed=False, msg='YARN HA already enabled')

    # deploy the base roles for a service, according to BASE_SERVICE_ROLE_MAP
    # after the deployment, run the init commands specified in SERVICE_INIT_COMMANDS
    elif action_a == 'deploy_base_roles':
      host_a = module.params.get('host', None)
      service_a = module.params.get('service', None)

      service_name = SERVICE_MAP[service_a]
      changed = False

      if not service_name in [x.name for x in cluster.get_all_services()]:
        service = cluster.create_service(service_name, service_name)
      else:
        service = cluster.get_service(service_name)

      service_roles = [x.name for x in service.get_all_roles()]

      # create each service from the map
      for (role_name, cloudera_name) in BASE_SERVICE_ROLE_MAP[service_a].items():
        # check if the role already exists; the script can't compare names directly
        # because after enabling HA on YARN, role names contain random strings
        if len([0 for x in service_roles if match(role_name, x) != None]) == 0:
          service.create_role(role_name, cloudera_name, host_a)
          changed = True

          # init commands
          if role_name in SERVICE_INIT_COMMANDS.keys():
            for command_to_run in SERVICE_INIT_COMMANDS[role_name]:
              # different handling of commands specified by name and
              # ones specified by an instance method
              if ismethod(command_to_run):
                command = command_to_run(service)
              else:
                command = service.service_command_by_name(command_to_run)

              if command.wait().success == False:
                module.fail_json(msg='Running {0} failed with {1}'.format(command_to_run, command.resultMessage))

      if changed == True:
        module.exit_json(changed=True, msg='Created base roles for {0}'.format(service_name))
      else:
        module.exit_json(changed=False, msg='{0} base roles already exist'.format(service_name))

    # deploy configuration - it always returns changed
    elif action_a == 'deploy_configuration':
      service_a = module.params.get('service', None)
      service_name = SERVICE_MAP[service_a]
      service = cluster.get_service(service_name)

      # deploying client configuration
      command = service.deploy_client_config()
      if command.wait().success == False:
        module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage))
      module.exit_json(changed=True, msg='Configuration deployed')
        
    # set config values for a given service/role
    elif action_a == 'set_config':
      entity_a = module.params.get('entity', None)
      service_a = module.params.get('service', None)
      role_a = module.params.get('role', None)
      name_a = module.params.get('name', None)
      value_a = module.params.get('value', None)

      if not service_a in SERVICE_MAP:
        module.fail_json(msg='Unknown service: {0}'.format(service_a))

      # since management is handled differently, it needs a different service
      if service_a == 'management':
        service = manager.get_service()
      elif service_a == 'cm':
        service = manager
      else:
        service = cluster.get_service(SERVICE_MAP[service_a])

      # role and service configs are handled differently
      if entity_a == 'service':
        prev_config = service.get_config()
        curr_config = service.update_config({name_a: value_a})
        if service_a == 'cm':
          prev_config = [prev_config]
          curr_config = [curr_config]
        module.exit_json(changed=(str(prev_config[0]) != str(curr_config[0])), msg='Config value for {0}: {1}'.format(name_a, curr_config[0][name_a]))

      elif entity_a == 'role':
        if not role_a in ROLE_MAP:
          module.fail_json(msg='Unknown role: {0}'.format(role_a))

        role = service.get_role_config_group(ROLE_MAP[role_a])
        prev_config = role.get_config()
        curr_config = role.update_config({name_a: value_a})
        module.exit_json(changed=(str(prev_config) != str(curr_config)), msg='Config value for {0}: {1}'.format(name_a, curr_config[name_a]))

      else:
        module.fail_json(msg='Invalid entity, must be one of service, role')

    # handle service state
    # currently this can only start/restart a service
    elif action_a == 'service':
      state_a = module.params.get('state', None)
      service_a = module.params.get('service', None)

      try:
        if service_a == 'cm':
          service = manager.get_service()
        else:
          service = cluster.get_service(SERVICE_MAP[service_a])
      except ApiException:
        module.fail_json(msg='Service does not exist')

      # when starting a service, we also deploy the client config for it
      if state_a == 'started':
        if service.serviceState == 'STARTED':
          module.exit_json(changed=False, msg='Service already running')
        method = service.start
        verb = "start"
      elif state_a == 'restarted':
        method = service.restart
        verb = "restart"

      try:
        command = service.deploy_client_config()
        if command.wait().success == False:
          module.fail_json(msg='Deploying client config failed with {0}'.format(command.resultMessage))
      # since there is no way to check if a service handles client config deployments
      # we try our best and pass the exception if it doesn't
      except (ApiException, AttributeError):
        pass

      method().wait()
      # we need to wait for cloudera checks to complete...
      # otherwise it will report as failing
      sleep(10)
      for i in range(24):
        sleep(10)
        service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a])
        if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
          break
      service = manager.get_service() if service_a == "cm" else cluster.get_service(SERVICE_MAP[service_a])
      if service.serviceState == 'STARTED' and service.healthSummary == 'GOOD':
        module.exit_json(changed=True, msg='Service {0} successful'.format(verb))
      else:
        module.fail_json(msg='Service {0} failed'.format(verb))

    # handle cluster
    # currently this can only restart
    elif action_a == 'cluster':
      state_a = module.params.get('state', None)

      if state_a == 'restarted':
        command = cluster.restart(redeploy_client_configuration=True)
        if command.wait().success == False:
          module.fail_json(msg='Cluster restart failed with {0}'.format(command.resultMessage))
        else:
          module.exit_json(changed=True, msg='Cluster restart successful')

    # Snapshot policy
    # only create is supported
    elif action_a == 'create_snapshot_policy':
      name_a = module.params.get('name', None)
      value_a = module.params.get('value', None)
      service_a = module.params.get('service', None)
      service = cluster.get_service(SERVICE_MAP[service_a])
      payload=loads(value_a)
      # check if the policy already exists; an exception is expected when configuring it for the first time
      try: 
        test = service.get_snapshot_policy(name_a)
        module.exit_json(changed=False, msg='Defined policy already exists')
      except ApiException:
        pass
      try:
        command = service.create_snapshot_policy(payload)
        module.exit_json(changed=True, msg='Snapshot policy was created.')
      except (ApiException, AttributeError):
        module.fail_json(msg='ERROR in creating snapshot policy.')
Ejemplo n.º 41
0
def main(cm_fqhn, cm_user_name, cm_user_password, cm_cluster_name,
         cm_tls_enabled, cm_tls_cafile):
    #print  cm_fqhn, cm_user_name, cm_user_password, cm_cluster_name, cm_tls_enabled, cm_tls_cafile
    if cm_tls_enabled == 'false':
        api = ApiResource(server_host=cm_fqhn,
                          username=cm_user_name,
                          password=cm_user_password)
    else:
        #context = ssl.create_default_context(cafile='/opt/cloudera/security/certs/ChainedCA.cert.pem')
        context = ssl.create_default_context(cafile=cm_tls_cafile)
        api = ApiResource(server_host=cm_fqhn,
                          username=cm_user_name,
                          password=cm_user_password,
                          use_tls=True,
                          ssl_context=context)

    # Get a list of all clusters
    cdh_cluster = None

    for c in api.get_all_clusters():
        if c.name == cm_cluster_name:
            print '\nCluster:', c
            cdh_cluster = c
            for x in cdh_cluster.list_hosts():
                HOST_NAME2ID_MAP[api.get_host(x.hostId).hostname] = x.hostId
                HOST_ID2NAME_MAP[x.hostId] = api.get_host(x.hostId).hostname
            print '\nHostName to HostId Mapping:'
            for x in HOST_NAME2ID_MAP:
                print x, HOST_NAME2ID_MAP[x]
            print '\nHostId to HostName Mapping:'
            for x in HOST_ID2NAME_MAP:
                print x, HOST_ID2NAME_MAP[x]
            print '\nServices:'
            for x in cdh_cluster.get_all_services():
                print x.type

            #ZooKeeper
            #zk_client_port = getKeyValueByServiceTypeAndRoleType(cdh_cluster,
            #                                    SERVICE_TYPE_MAP['zookeeper'],
            #                                    SERVICE_ROLE_TYPE_MAP['zookeeper'],
            #                                    'clientPort');
            zk_service = getServiceByServiceType(cdh_cluster,
                                                 SERVICE_TYPE_MAP['zookeeper'])
            zk_server_rcg = getRCGByServiceAndRoleType(
                zk_service, SERVICE_ROLE_TYPE_MAP['zookeeper_server'])
            zk_client_port = geValueByKeyInRCG(
                zk_server_rcg, CONFIG_PROPERTY_MAP['zk_client_port'])
            if zk_client_port != None:
                CONFIG_KEY_VALUE_MAP['ZOOKEEPER_PORT'] = zk_client_port
            zk_hosts = getHostsByServiceAndRoleType(
                zk_service, SERVICE_ROLE_TYPE_MAP['zookeeper_server'])
            #print 'ZOOKEEPER HOSTS:', zk_hosts
            if len(zk_hosts) > 0:
                CONFIG_KEY_VALUE_MAP['ZOOKEEPER_QUORUM'] = ' '.join(zk_hosts)

            #HDFS
            hdfs_service = getServiceByServiceType(cdh_cluster,
                                                   SERVICE_TYPE_MAP['hdfs'])
            hdfs_nn_rcg = getRCGByServiceAndRoleType(
                hdfs_service, SERVICE_ROLE_TYPE_MAP['namenode'])
            #inspectKVsInRCG(hdfs_nn_rcg)
            hdfs_nn_ns = geValueByKeyInRCG(hdfs_nn_rcg,
                                           CONFIG_PROPERTY_MAP['hdf_nn_ns'])
            #print 'HDFS NAMENODE NAMESERVICE:', hdfs_nn_ns
            hdfs_nn_port = geValueByKeyInRCG(
                hdfs_nn_rcg, CONFIG_PROPERTY_MAP['hdf_nn_port'])
            #print 'HDFS NAMENODE PORT:', hdfs_nn_port
            if hdfs_nn_port == None:
                hdfs_nn_port = CONFIG_KEY_VALUE_MAP['NAME_NODE_PORT']
            else:
                CONFIG_KEY_VALUE_MAP['NAME_NODE_PORT'] = hdfs_nn_port
            nn_hosts = None
            if hdfs_nn_ns == None:
                nn_hosts = getHostsByServiceAndRoleType(
                    hdfs_service, SERVICE_ROLE_TYPE_MAP['namenode'])
                #print 'HDFS NAMENODE HOSTS:', nn_hosts
                CONFIG_KEY_VALUE_MAP[
                    'NAME_NODE'] = 'hdfs://' + nn_hosts[0] + ':' + hdfs_nn_port
            else:
                CONFIG_KEY_VALUE_MAP['NAME_NODE'] = hdfs_nn_ns

            #YARN
            yarn_service = getServiceByServiceType(cdh_cluster,
                                                   SERVICE_TYPE_MAP['yarn'])
            #inspectRolesByService(yarn_service)
            #inspectRCGs(yarn_service)
            yarn_jt_rcg = getRCGByServiceAndRoleType(
                yarn_service, SERVICE_ROLE_TYPE_MAP['resourcemanager'])
            #inspectKVsInRCG(yarn_jt_rcg)
            yarn_rm_address = geValueByKeyInRCG(
                yarn_jt_rcg, CONFIG_PROPERTY_MAP['yarn_rm_address'])
            if yarn_rm_address == None:
                yarn_rm_address = CONFIG_KEY_VALUE_MAP[
                    'RESOURCEMANAGER_ADDRESS']
            else:
                CONFIG_KEY_VALUE_MAP[
                    'RESOURCEMANAGER_ADDRESS'] = yarn_rm_address
            rm_hosts = getHostsByServiceAndRoleType(
                yarn_service, SERVICE_ROLE_TYPE_MAP['resourcemanager'])
            #print 'YARN RESOURCEMANGER HOSTS:', rm_hosts
            CONFIG_KEY_VALUE_MAP[
                'JOB_TRACKER'] = rm_hosts[0] + ':' + yarn_rm_address

            #OOZIE
            oozie_service = getServiceByServiceType(cdh_cluster,
                                                    SERVICE_TYPE_MAP['oozie'])
            #inspectConfigByService(oozie_service)
            oozie_use_ssl = getValueByKeyServiceConfig(
                oozie_service, CONFIG_PROPERTY_MAP['oozie_use_ssl'])
            #print 'OOZIE TLS/SSL:', oozie_use_ssl
            if oozie_use_ssl == 'true':
                CONFIG_KEY_VALUE_MAP['OOZIE_USE_SSL'] = 'true'
            oozie_LB = getValueByKeyServiceConfig(
                oozie_service, CONFIG_PROPERTY_MAP['oozie_load_balancer'])

            #inspectRolesByService(oozie_service)
            #inspectRCGs(oozie_service)
            oozie_server_rcg = getRCGByServiceAndRoleType(
                oozie_service, SERVICE_ROLE_TYPE_MAP['oozie_server'])
            #inspectKVsInRCG(oozie_server_rcg)
            oozie_http_port = geValueByKeyInRCG(
                oozie_server_rcg, CONFIG_PROPERTY_MAP['oozie_http_port'])
            oozie_https_port = geValueByKeyInRCG(
                oozie_server_rcg, CONFIG_PROPERTY_MAP['oozie_https_port'])
            if oozie_http_port == None:
                oozie_http_port = CONFIG_KEY_VALUE_MAP['OOZIE_HTTP_PORT']
            if oozie_https_port == None:
                oozie_https_port = CONFIG_KEY_VALUE_MAP['OOZIE_HTTPS_PORT']
            #print 'OOOZIE http(s) ports:', oozie_http_port, oozie_https_port
            oozie_hosts = getHostsByServiceAndRoleType(
                oozie_service, SERVICE_ROLE_TYPE_MAP['oozie_server'])
            #print oozie_hosts
            if CONFIG_KEY_VALUE_MAP['OOZIE_USE_SSL'] == 'true':
                if oozie_LB != None:
                    CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = 'https://' + oozie_LB
                else:
                    CONFIG_KEY_VALUE_MAP[
                        'OOZIE_URL'] = 'https://' + oozie_hosts[
                            0] + ':' + CONFIG_KEY_VALUE_MAP[
                                'OOZIE_HTTPS_PORT'] + '/oozie'
            else:
                if oozie_LB != None:
                    CONFIG_KEY_VALUE_MAP['OOZIE_URL'] = 'http://' + oozie_LB
                else:
                    CONFIG_KEY_VALUE_MAP[
                        'OOZIE_URL'] = 'http://' + oozie_hosts[
                            0] + ':' + CONFIG_KEY_VALUE_MAP[
                                'OOZIE_HTTP_PORT'] + '/oozie'

            #HBASE
            hbase_service = getServiceByServiceType(cdh_cluster,
                                                    SERVICE_TYPE_MAP['hbase'])
            #inspectConfigByService(hbase_service)
            #inspectRolesByService(hbase_service)
            hbase_rs_rcg = getRCGByServiceAndRoleType(
                hbase_service, SERVICE_ROLE_TYPE_MAP['hbase_restserver'])
            #inspectKVsInRCG(hbase_rs_rcg)
            hbase_rs_port = geValueByKeyInRCG(
                hbase_rs_rcg, CONFIG_PROPERTY_MAP['hbase_rs_port'])
            if hbase_rs_port != None:
                CONFIG_KEY_VALUE_MAP['HBASE_REST_PORT'] = hbase_rs_port
            hbase_rs_hosts = getHostsByServiceAndRoleType(
                hbase_service, SERVICE_ROLE_TYPE_MAP['hbase_restserver'])
            CONFIG_KEY_VALUE_MAP['HBASE_REST_IP'] = hbase_rs_hosts[0]

            #KAFKA
            kafka_service = getServiceByServiceType(cdh_cluster,
                                                    SERVICE_TYPE_MAP['kafka'])
            #inspectConfigByService(kafka_service)
            #inspectRolesByService(kafka_service)
            kafka_broker_rcg = getRCGByServiceAndRoleType(
                kafka_service, SERVICE_ROLE_TYPE_MAP['kafka_broker'])
            #inspectKVsInRCG(kafka_broker_rcg)
            kafka_client_security_protocol = geValueByKeyInRCG(
                kafka_broker_rcg,
                CONFIG_PROPERTY_MAP['kafka_client_security_protocol'])
            if kafka_client_security_protocol != None:
                CONFIG_KEY_VALUE_MAP[
                    'KAFKA_SECURITY_PROTOCOL'] = kafka_client_security_protocol
            kafka_broker_hosts = getHostsByServiceAndRoleType(
                kafka_service, SERVICE_ROLE_TYPE_MAP['kafka_broker'])
            if len(kafka_broker_hosts) > 0:
                CONFIG_KEY_VALUE_MAP['KAFKA_BROKER'] = ' '.join(kafka_broker_hosts)

            # Print all
            print '\nOUTPUT:\n', CONFIG_KEY_VALUE_MAP
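The helper functions used throughout this example (getServiceByServiceType, getRCGByServiceAndRoleType, geValueByKeyInRCG, getHostsByServiceAndRoleType, and so on) are defined elsewhere in the source file and are not part of this snippet. A minimal sketch of what two of them might look like, for illustration only, built on standard cm_api calls:

def getServiceByServiceType(cluster, service_type):
    # return the first service of the given type, or None if the cluster has none
    for service in cluster.get_all_services():
        if service.type == service_type:
            return service
    return None

def getRCGByServiceAndRoleType(service, role_type):
    # return the first role config group whose roleType matches, or None
    for rcg in service.get_all_role_config_groups():
        if rcg.roleType == role_type:
            return rcg
    return None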
Ejemplo n.º 42
0
def main(argv):
    #choosing a date format for the report
    fmt = '%Y-%m-%d %H:%M:%S %Z'
    current_datetime = datetime.datetime.now()
    current_date = current_datetime.date()
    str_current_datetime = str(current_datetime)
    str_current_date = str(current_date)

    ### Initialize script
    mail_content_file = "/root/scripts/mail_content_{0}".format(str_current_date)
    print mail_content_file

    ### Settings to connect to BDR cluster
    #This is a one time setup
    cm_host = "cm_host"
    cm_port = "7180"
    cm_login = "******"
    cm_password = "******"
    bdr_cluster_name = "your backup cluster name" 

    #This program takes one parameter called limit, which limits the most recent N instances of a job to be reported
    # to get only the most recent run, set limit to 1
    limit = 1
    if len(argv) == 1:
        usage = 'Usage: %s <limit>' % (argv[0])
        print usage
        quit(1)
    elif len(argv) == 2:
        if argv[1].isdigit():
            limit = argv[1]
        else:
            limit = 7
    else:
        limit = 1
    print 'Limit: %s' % (str(limit))

    #These variables are used later in a loop
    bdr_cluster = None
    hdfs_service = None
    hive_service = None

    ### Connect to CM
    print "\nConnecting to Cloudera Manager at " + cm_host + ":" + cm_port
    api = ApiResource(server_host=cm_host, server_port=cm_port, username=cm_login, password=cm_password)

    ### Get BDR Cluster
    clusters = api.get_all_clusters()
    for cluster in clusters:
        if cluster.displayName == bdr_cluster_name:
            bdr_cluster = cluster
            break
    if bdr_cluster is None:
        print "Error: Cluster '" + bdr_cluster_name + "' not found"
        quit(1)

    ### Get Hive Service
    service_list = bdr_cluster.get_all_services()
    for service in service_list:
        if service.type == "HIVE":
            hive_service = service
            break
    if hive_service is None:
        print "Error: Could not locate Hive Service"
        quit(1)

    ### Get HDFS Service
    service_list = bdr_cluster.get_all_services()
    for service in service_list:
        if service.type == "HDFS":
            hdfs_service = service
            break
    if hdfs_service is None:
        print "Error: Could not locate HDFS Service"
        quit(1)

    #open the mail content file for writing
    fp = open(mail_content_file, 'w')

    ### Begin: Hive Replication
    formatted_str = "\n### Begin: Hive replications ###".format()
    print formatted_str
    fp.write(formatted_str)
    #header format for hive replication
    #Status	StartTime	EndTime	Database	Message
    formatted_str = "\nStatus\tStart\tEnd\tDB\tMessage".format()
    print formatted_str
    fp.write(formatted_str)

    schedules = hive_service.get_replication_schedules()

    ## Iterate through all replication schedules
    for schedule in schedules:
        ## Get the Hive Replication Arguments
        hive_args = schedule.hiveArguments
        replicate_data = hive_args.replicateData  

        ## Get the HDFS Replication Arguments for the Hive job
        if replicate_data:
            hdfs_args = hive_args.hdfsArguments

        ## get the replication schedule ID
        id = str(schedule.id)

        ## Get the history of commands for the scheduled Hive replication
        command_history = hive_service.get_replication_command_history(schedule_id=schedule.id, limit=limit, view='full')

        ## for each replication command for this schedule
        for command in command_history:
            if command.hiveResult is None:
                continue
            hive_result =  command.hiveResult
            if hive_result.tables is None:
                continue
            tables = hive_result.tables
            database_name = ''
            for table in tables:
                database_name = table.database
                break
            start_time = command.startTime.strftime(fmt)

            result_message = ''
            if command.resultMessage:
                result_message = command.resultMessage

            if command.active:
                formatted_str = "\nRunning\t{0}\t{1}\t\t{2}".format(start_time, database_name, result_message)
                print formatted_str
                fp.write(formatted_str)
            else:
                end_time = command.endTime.strftime(fmt)
                if not command.success:
                    formatted_str = "\n****Failed\t{0}\t{1}\t{2}\t\t{3}".format(start_time, end_time, database_name, result_message)
                    print formatted_str
                    fp.write(formatted_str)
                else:
                    formatted_str = "\nSucceeded\t{0}\t{1}\t{2}\t\t{3}".format(start_time, end_time, database_name, result_message)
                    print formatted_str
                    fp.write(formatted_str)

    ##############################
    ### End: Hive replications ###
    ##############################

    ### Begin: HDFS Replication
    formatted_str = "\n\n### Begin: HDFS replications ###".format()
    print formatted_str
    fp.write(formatted_str)
    #header format for hdfs replication
    #Status	StartTime	EndTime	HDFS_Path	Message	Files_Expected	Files_Copied	Files_Skipped	Files_Failed
    formatted_str = "\nStatus\tStart\tEnd\tPath\tMessage\tFiles Expected\tFiles Copied\tFiles Skipped\tFiles Failed".format()
    print formatted_str
    fp.write(formatted_str)

    schedules = hdfs_service.get_replication_schedules()

    ### Iterate through all replication schedules
    for schedule in schedules:
        ### Get the HDFS Arguments
        hdfs_args = schedule.hdfsArguments

        ### get the replication schedule ID
        id = str(schedule.id)

        ## Get the history of commands for the scheduled HDFS replication
        command_history = hdfs_service.get_replication_command_history(schedule_id=schedule.id, limit=limit, view='full')
        for command in command_history:
            if command.hdfsResult is None:
                continue
            hdfs_result = command.hdfsResult
            start_time = command.startTime.strftime(fmt)
            source_path = hdfs_args.sourcePath

            numFilesExpected = hdfs_result.numFilesExpected
            numFilesCopied = hdfs_result.numFilesCopied
            numFilesSkipped = hdfs_result.numFilesSkipped
            numFilesCopyFailed = hdfs_result.numFilesCopyFailed

            result_message = ''
            if command.resultMessage:
                result_message = command.resultMessage
            if command.active:
                formatted_str = "\nRunning\t{0}\t{1}\t\t{2}\t{3}\t{4}\t{5}\t{6}".format(start_time, source_path, result_message, str(numFilesExpected), str(numFilesCopied), str(numFilesSkipped), str(numFilesCopyFailed))
                print formatted_str
                fp.write(formatted_str)
            else:
                end_time = command.endTime.strftime(fmt)
                if not command.success:
                    formatted_str = "\n****Failed\t{0}\t{1}\t{2}\t\t{3}\t{4}\t{5}\t{6}\t{7}".format(start_time, end_time, source_path, result_message, str(numFilesExpected), str(numFilesCopied), str(numFilesSkipped), str(numFilesCopyFailed))
                    print formatted_str
                    fp.write(formatted_str)
                else:
                    formatted_str = "\nSucceeded\t{0}\t{1}\t{2}\t\t{3}\t{4}\t{5}\t{6}\t{7}".format(start_time, end_time, source_path, result_message, str(numFilesExpected), str(numFilesCopied), str(numFilesSkipped), str(numFilesCopyFailed))
                    print formatted_str
                    fp.write(formatted_str)

    ##############################
    ### End: HDFS replications ###
    ##############################

    #print the hostname and the current time at the end of report and close the mail content file
    hostname = socket.gethostname()
    formatted_str = "\n\nCurrent Time on {0} is {1}".format(hostname, str_current_datetime)
    print formatted_str
    fp.write(formatted_str)
    fp.close()

    #send email
    from_addr = 'from address'
    to_addr = 'to address'
    mail_subject = 'Report from %s - Daily BDR Status Report %s' % (hostname, str_current_date)
    send_email(from_addr, to_addr, mail_subject, mail_content_file)

    quit(0)
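send_email is called at the end of the report but is not defined in this snippet; a minimal sketch using the standard smtplib and email modules, assuming an SMTP relay is reachable on localhost:

import smtplib
from email.mime.text import MIMEText

def send_email(from_addr, to_addr, subject, content_file):
    # read the report body and send it as a plain-text message through the local relay
    with open(content_file) as f:
        msg = MIMEText(f.read())
    msg['Subject'] = subject
    msg['From'] = from_addr
    msg['To'] = to_addr
    server = smtplib.SMTP('localhost')
    server.sendmail(from_addr, [to_addr], msg.as_string())
    server.quit()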
Ejemplo n.º 43
0
class CmCluster(Cluster):

  def __init__(self, host_name, port=None, user="******", password="******",
               cluster_name=None, ssh_user=None, ssh_port=None, ssh_key_file=None,
               use_tls=False):
    # Initialize strptime() to workaround https://bugs.python.org/issue7980. Apparently
    # something in the CM API uses strptime().
    strptime("2015", "%Y")

    Cluster.__init__(self)
    # IMPALA-5455: If the caller doesn't specify port, default it based on use_tls
    if port is None:
      if use_tls:
        port = CM_TLS_PORT
      else:
        port = CM_CLEAR_PORT
    self.cm = CmApiResource(host_name, server_port=port, username=user, password=password,
                            use_tls=use_tls)
    clusters = self.cm.get_all_clusters()
    if not clusters:
      raise Exception("No clusters found in CM at %s" % host_name)
    if cluster_name:
      clusters_by_name = dict((c.name, c) for c in clusters)
      if cluster_name not in clusters_by_name:
        raise Exception(("No clusters named %s found in CM at %s."
            "Available clusters are %s.")
            % (cluster_name, host_name, ", ".join(sorted(clusters_by_name.keys()))))
      self.cm_cluster = clusters_by_name[cluster_name]
    else:
      if len(clusters) > 1:
        raise Exception(("Too many clusters found in CM at %s;"
            " a cluster name must be provided")
            % host_name)
      self.cm_cluster = clusters[-1]

    self.ssh_user = ssh_user
    self.ssh_port = ssh_port
    self.ssh_key_file = ssh_key_file
    self._ssh_client_lock = Lock()
    self._ssh_clients_by_host_name = defaultdict(list)

  def shell(self, cmd, host_name, timeout_secs=DEFAULT_TIMEOUT):
    with self._ssh_client(host_name) as client:
      return client.shell(cmd, timeout_secs=timeout_secs)

  @contextmanager
  def _ssh_client(self, host_name):
    """Returns an SSH client for use in a 'with' block. When the 'with' context exits,
       the client will be kept for reuse.
    """
    with self._ssh_client_lock:
      clients = self._ssh_clients_by_host_name[host_name]
      if clients:
        client = clients.pop()
      else:
        # IMPALA-7460: Insulate this import away from the global context so as to avoid
        # requiring Paramiko unless it's absolutely needed.
        from tests.util.ssh_util import SshClient
        LOG.debug("Creating new SSH client for %s", host_name)
        client = SshClient()
        client.connect(host_name, username=self.ssh_user, key_filename=self.ssh_key_file)
    error_occurred = False
    try:
      yield client
    except Exception:
      error_occurred = True
      raise
    finally:
      if not error_occurred:
        with self._ssh_client_lock:
          self._ssh_clients_by_host_name[host_name].append(client)

  def _init_local_hadoop_conf_dir(self):
    self._local_hadoop_conf_dir = mkdtemp()
    data = StringIO(self.cm.get("/clusters/%s/services/%s/clientConfig"
      % (self.cm_cluster.name, self._find_service("HIVE").name)))
    zip_file = ZipFile(data)
    for name in zip_file.namelist():
      if name.endswith("/"):
        continue
      extract_path = os.path.join(self._local_hadoop_conf_dir, os.path.basename(name))
      with open(extract_path, "w") as conf_file:
        conf_file.write(zip_file.open(name).read())

  def _find_service(self, service_type):
    """Find a service by its CM API service type. An exception will be raised if no
       service is found or multiple services are found. See the CM API documentation for
       more details about the service type.
    """
    services = [s for s in self.cm_cluster.get_all_services() if s.type == service_type]
    if not services:
      raise Exception("No service of type %s found in cluster %s"
          % (service_type, self.cm_cluster.name))
    if len(services) > 1:
      raise Exception("Found %s services in cluster %s; only one is expected."
        % (len(services), self.cm_cluster.name))
    return services[0]

  def _find_role(self, role_type, service_type):
    """Find a role by its CM API role and service type. An exception will be raised if
       no roles are found. See the CM API documentation for more details about the
       service and role types.
    """
    service = self._find_service(service_type)
    roles = service.get_roles_by_type(role_type)
    if not roles:
      raise Exception("No roles of type %s found in service %s"
          % (role_type, service.name))
    return roles[0]

  def _init_hdfs(self):
    self._hdfs = Hdfs(self, "hdfs")

  def _init_hive(self):
    hs2 = self._find_role("HIVESERVER2", "HIVE")
    host = self.cm.get_host(hs2.hostRef.hostId)
    config = hs2.get_config(view="full")["hs2_thrift_address_port"]
    self._hive = Hive(self, str(host.hostname), int(config.value or config.default))

  def _init_impala(self):
    self._impala = CmImpala(self, self._find_service("IMPALA"))
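A small usage sketch for the class above; the host names, credentials, and key path below are placeholders, and the SSH arguments are only needed if shell() is used:

cluster = CmCluster("cm-host.example.com", user="admin", password="admin",
                    ssh_user="ec2-user", ssh_key_file="/home/ec2-user/.ssh/id_rsa")
print cluster.shell("hostname", "worker-1.example.com")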
Ejemplo n.º 44
0
def install_java_8(region, stack_name):
    # following general protocol for upgrading to JDK 1.8 here:
    # http://www.cloudera.com/content/cloudera/en/documentation/core/v5-3-x/topics/cdh_cm_upgrading_to_jdk8.html
    ec2_conn = create_ec2_connection(region)
    manager_instance = get_manager_instance(ec2_conn, stack_name)
    cluster_instances = (
        get_worker_instances(ec2_conn, stack_name) +
        [manager_instance, get_master_instance(ec2_conn, stack_name)])
    cluster_hosts = [i.ip_address for i in cluster_instances]

    with cm_tunnel_ctx(manager_instance) as local_port:
        # Connect to CM API
        cm_api = ApiResource('localhost', username='******', password='******',
                             server_port=local_port, version=9)
        cloudera_manager = cm_api.get_cloudera_manager()

        # Stop Cloudera Management Service
        print "Stopping Cloudera Management Service"
        mgmt_service = cloudera_manager.get_service()
        mgmt_service.stop().wait()

        # Stop cluster
        print "Stopping the cluster"
        clusters = cm_api.get_all_clusters()
        cluster = clusters.objects[0]
        cluster.stop().wait()

    # Stop all Cloudera Manager Agents
    @parallel
    def stop_cm_agents():
        sudo('service cloudera-scm-agent stop')
    execute(stop_cm_agents, hosts=cluster_hosts)

    # Stop the Cloudera Manager Server
    def stop_cm_server():
        sudo('service cloudera-scm-server stop')
    execute(stop_cm_server, hosts=[manager_instance.ip_address])

    # Cleanup other Java versions and install JDK 1.8
    @parallel
    def swap_jdks():
        sudo('rpm -qa | grep jdk | xargs rpm -e')
        sudo('rm -rf /usr/java/jdk1.6*')
        sudo('rm -rf /usr/java/jdk1.7*')
        run('wget -O jdk-8-linux-x64.rpm --no-cookies --no-check-certificate '
            '--header "Cookie: oraclelicense=accept-securebackup-cookie" '
            'http://download.oracle.com/otn-pub/java/jdk/8u51-b16/'
            'jdk-8u51-linux-x64.rpm')
        sudo('yum install -y jdk-8-linux-x64.rpm')
        append('/home/ec2-user/.bash_profile',
               'export JAVA_HOME=`find /usr/java -name "jdk1.8*"`')
    execute(swap_jdks, hosts=cluster_hosts)

    # Start the Cloudera Manager Server
    def start_cm_server():
        sudo('service cloudera-scm-server start')
    execute(start_cm_server, hosts=[manager_instance.ip_address])

    # Start all Cloudera Manager Agents
    @parallel
    def start_cm_agents():
        sudo('service cloudera-scm-agent start')
    execute(start_cm_agents, hosts=cluster_hosts)

    with cm_tunnel_ctx(manager_instance) as local_port:
        # Connect to CM API
        cm_api = ApiResource('localhost', username='******', password='******',
                             server_port=local_port, version=9)
        cloudera_manager = cm_api.get_cloudera_manager()

        # Start the cluster and the mgmt service. Re-fetch the cluster through the
        # new connection; the object obtained earlier is bound to the closed tunnel.
        print "Starting the cluster"
        cluster = cm_api.get_all_clusters()[0]
        cluster.start().wait()
        print "Starting the Cloudera Management Service"
        mgmt_service = cloudera_manager.get_service()
        mgmt_service.start().wait()
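
A sanity check after the agents come back can confirm that every node actually picked up a 1.8 JDK before the cluster is restarted. This is a hedged sketch, not part of the original script: it reuses the same Fabric helpers (run, execute, @parallel) and assumes a cluster_hosts list built as in install_java_8 above.

from fabric.api import execute, parallel, run

@parallel
def check_java_version():
    # Each host should now have a JDK 1.8 directory under /usr/java.
    run('ls -d /usr/java/jdk1.8*')
    run('$(ls -d /usr/java/jdk1.8* | head -1)/bin/java -version')

execute(check_java_version, hosts=cluster_hosts)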
Ejemplo n.º 45
0
def main():
    api = ApiResource('r2341-d5-us01', username='******', password='******')
    # check whether get_all_clusters() returns anything
    what = api.get_all_clusters()
    print "what:", what
Ejemplo n.º 46
0
def print_impala_resource_configs(api, stype):
    for s in api.get_all_clusters()[0].get_all_services(view='full'):
        if s.type == stype:
            print "name: %s displayName: %s" % (s.name, s.displayName)
            config = s.get_config()[0]
            print config


for service in api.get_all_clusters()[0].get_all_services():
    print "** Service name:%s type: %s displayName: %s" % (service.name, service.type, service.displayName)
    config = service.get_config()
    print
    print "Service Config: %s" % str(config)
    print
    for rcg in service.get_all_role_config_groups():
        print "rcg name: %s rcg display name: %s" % (rcg.name, rcg.displayName)
        config = rcg.get_config()
        print
        print "rcg config: %s" % str(config)
    for role in service.get_all_roles():
        print
        print "role name: %s role type: %s" % (role.name, role.type)
        print
        print "role config: %s" % (role.get_config())
Ejemplo n.º 47
0
class CmCluster(Cluster):
    def __init__(self,
                 host_name,
                 port=None,
                 user="******",
                 password="******",
                 cluster_name=None,
                 ssh_user=None,
                 ssh_port=None,
                 ssh_key_file=None,
                 use_tls=False):
        # Initialize strptime() to workaround https://bugs.python.org/issue7980. Apparently
        # something in the CM API uses strptime().
        strptime("2015", "%Y")

        Cluster.__init__(self)
        # IMPALA-5455: If the caller doesn't specify port, default it based on use_tls
        if port is None:
            if use_tls:
                port = CM_TLS_PORT
            else:
                port = CM_CLEAR_PORT
        self.cm = CmApiResource(host_name,
                                server_port=port,
                                username=user,
                                password=password,
                                use_tls=use_tls)
        clusters = self.cm.get_all_clusters()
        if not clusters:
            raise Exception("No clusters found in CM at %s" % host_name)
        if cluster_name:
            clusters_by_name = dict((c.name, c) for c in clusters)
            if cluster_name not in clusters_by_name:
                raise Exception(("No clusters named %s found in CM at %s."
                                 "Available clusters are %s.") %
                                (cluster_name, host_name, ", ".join(
                                    sorted(clusters_by_name.keys()))))
            self.cm_cluster = clusters_by_name[cluster_name]
        else:
            if len(clusters) > 1:
                raise Exception(
                    ("Too many clusters found in CM at %s;"
                     " a cluster name must be provided") % host_name)
            self.cm_cluster = clusters[-1]

        self.ssh_user = ssh_user
        self.ssh_port = ssh_port
        self.ssh_key_file = ssh_key_file
        self._ssh_client_lock = Lock()
        self._ssh_clients_by_host_name = defaultdict(list)

    def shell(self, cmd, host_name, timeout_secs=DEFAULT_TIMEOUT):
        with self._ssh_client(host_name) as client:
            return client.shell(cmd, timeout_secs=timeout_secs)

    @contextmanager
    def _ssh_client(self, host_name):
        """Returns an SSH client for use in a 'with' block. When the 'with' context exits,
       the client will be kept for reuse.
    """
        with self._ssh_client_lock:
            clients = self._ssh_clients_by_host_name[host_name]
            if clients:
                client = clients.pop()
            else:
                LOG.debug("Creating new SSH client for %s", host_name)
                client = SshClient()
                client.connect(host_name,
                               username=self.ssh_user,
                               key_filename=self.ssh_key_file)
        error_occurred = False
        try:
            yield client
        except Exception:
            error_occurred = True
            raise
        finally:
            if not error_occurred:
                with self._ssh_client_lock:
                    self._ssh_clients_by_host_name[host_name].append(client)

    def _init_local_hadoop_conf_dir(self):
        self._local_hadoop_conf_dir = mkdtemp()
        data = StringIO(
            self.cm.get(
                "/clusters/%s/services/%s/clientConfig" %
                (self.cm_cluster.name, self._find_service("HIVE").name)))
        zip_file = ZipFile(data)
        for name in zip_file.namelist():
            if name.endswith("/"):
                continue
            extract_path = os.path.join(self._local_hadoop_conf_dir,
                                        os.path.basename(name))
            with open(extract_path, "w") as conf_file:
                conf_file.write(zip_file.open(name).read())

    def _find_service(self, service_type):
        """Find a service by its CM API service type. An exception will be raised if no
       service is found or multiple services are found. See the CM API documentation for
       more details about the service type.
    """
        services = [
            s for s in self.cm_cluster.get_all_services()
            if s.type == service_type
        ]
        if not services:
            raise Exception("No service of type %s found in cluster %s" %
                            (service_type, self.cm_cluster.name))
        if len(services) > 1:
            raise Exception(
                "Found %s services in cluster %s; only one is expected." %
                (len(services), self.cm_cluster.name))
        return services[0]

    def _find_role(self, role_type, service_type):
        """Find a role by its CM API role and service type. An exception will be raised if
       no roles are found. See the CM API documentation for more details about the
       service and role types.
    """
        service = self._find_service(service_type)
        roles = service.get_roles_by_type(role_type)
        if not roles:
            raise Exception("No roles of type %s found in service %s" %
                            (role_type, service.name))
        return roles[0]

    def _init_hdfs(self):
        self._hdfs = Hdfs(self, "hdfs")

    def _init_hive(self):
        hs2 = self._find_role("HIVESERVER2", "HIVE")
        host = self.cm.get_host(hs2.hostRef.hostId)
        config = hs2.get_config(view="full")["hs2_thrift_address_port"]
        self._hive = Hive(self, str(host.hostname),
                          int(config.value or config.default))

    def _init_impala(self):
        self._impala = CmImpala(self, self._find_service("IMPALA"))
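
A brief usage sketch for the class above. The host name, credentials, cluster name, and SSH details are placeholders, and the Cluster base class, CmApiResource, and module-level constants (CM_TLS_PORT, CM_CLEAR_PORT, DEFAULT_TIMEOUT) are assumed to come from the surrounding module:

cluster = CmCluster('cm-host.example.com', user='admin', password='admin',
                    cluster_name='Cluster 1', use_tls=True,  # port defaults to CM_TLS_PORT
                    ssh_user='ec2-user', ssh_key_file='/path/to/key.pem')
print(cluster.cm_cluster.name)
# Run a command on one of the cluster hosts through the pooled SSH clients.
some_host = cluster.cm.get_all_hosts()[0].hostname
print(cluster.shell('hostname', some_host))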
Ejemplo n.º 48
0
def main():
  api = ApiResource('r2341-d5-us01', username='******', password='******')
  clusters = api.get_all_clusters()
  print "clusters:", clusters
  if len(clusters) == 0: 
    print "none"
Ejemplo n.º 49
0
class APIClient:
    def __init__(self, cm_host, cm_user, cm_pass, cluster_name=None):
        self.SERVICE_HIVE = 'HIVE'
        self.SERVICE_HUE = 'HUE'
        self.SERVICE_IMPALA = 'IMPALA'
        self.SERVICE_SOLR = 'SOLR'
        self.SERVICE_YARN = 'YARN'
        self.SERVICE_HDFS = 'HDFS'
        self.SERVICE_HBASE = 'HBASE'
        self.SERVICE_ZK = 'ZOOKEEPER'
        self.SERVICE_SENTRY = 'SENTRY'

        self.api = ApiResource(
            cm_host,
            username=cm_user,
            password=cm_pass,
        )

        self.cluster = None
        self.services = {}
        for c in self.api.get_all_clusters():
            if cluster_name is None or cluster_name == c.name:
                self.cluster = c
                break

        if self.cluster is None:
            raise Exception("No matching cluster found in CM (cluster_name=%s)" % cluster_name)

        for service in self.cluster.get_all_services():
            self.services[service.type] = service

    @staticmethod
    def get_api_client(cluster_name, cm_host, cm_user, cm_pass):
        return APIClient(cm_host, cm_user, cm_pass, cluster_name=cluster_name)

    def has_sentry(self):
        """
        This function checks if sentry service is available in the cluster
        :return: boolean
        """
        return self.SERVICE_SENTRY in self.services

    def has_hbase(self):
        """
        This function checks if hbase service is available in the cluster
        :return: boolean
        """
        return self.SERVICE_HBASE in self.services

    def get_impala_service(self):
        """
        This function returns the impala service instance
        :return: ApiService or None
        """
        if self.SERVICE_IMPALA in self.services:
            return self.services[self.SERVICE_IMPALA]

        return None

    def get_hiveserver2_service(self):
        """
        This function returns the hiveserver2 service instance
        :return: ApiService or None
        """
        if self.SERVICE_HIVE in self.services:
            return self.services[self.SERVICE_HIVE]

        return None

    def get_hbase_service(self):
        """
        This function returns the hbase service instance
        :return: ApiService or None
        """
        if self.SERVICE_HBASE in self.services:
            return self.services[self.SERVICE_HBASE]

        return None

    def get_hdfs_service(self):
        """
        This function returns the hdfs service instance
        :return: ApiService or None
        """
        if self.SERVICE_HDFS in self.services:
            return self.services[self.SERVICE_HDFS]

        return None

    def enable_sentry(self):
        service_list = [
            self.SERVICE_HIVE, self.SERVICE_IMPALA, self.SERVICE_YARN,
            self.SERVICE_HUE
        ]

        for s_name in service_list:
            if s_name in self.services:
                className = s_name.capitalize() + "APIClient"
                module = importlib.import_module("api." + className)
                class_ = getattr(module, className)
                client = class_(self.services[s_name])
                client.enable_sentry()

    def enable_kerberos(self):
        service_list = [
            self.SERVICE_HDFS, self.SERVICE_ZK, self.SERVICE_HBASE,
            self.SERVICE_SOLR
        ]

        for s_name in service_list:
            if s_name in self.services:
                className = s_name.capitalize() + "APIClient"
                module = importlib.import_module("api." + className)
                class_ = getattr(module, className)
                client = class_(self.services[s_name])
                client.enable_kerberos()

    def enable_impala_vip(self, host):
        impala_service = self.get_impala_service()
        ImpalaAPIClient(impala_service).enable_load_balancer(host)

    def disable_impala_vip(self):
        impala_service = self.get_impala_service()
        ImpalaAPIClient(impala_service).disable_load_balancer()

    def hiveserver2_create_role(self, host, i):
        hive_service = self.get_hiveserver2_service()
        HiveAPIClient(hive_service).add_hs2_role(host, i)

    def enable_hive_vip(self, host):
        service = self.get_hiveserver2_service()
        HiveAPIClient(service).enable_load_balancer(host)

    def disable_hive_vip(self):
        service = self.get_hiveserver2_service()
        HiveAPIClient(service).disable_load_balancer()

    def enable_hbase_authorization(self):
        service = self.get_hbase_service()
        HbaseAPIClient(service).enable_authorization()

    def enable_sentry_hdfs_sync(self, prefixes):
        service = self.get_hdfs_service()
        HdfsAPIClient(service).sentry_sync(prefixes)
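
A short usage sketch for APIClient. The CM host, credentials, and cluster name are placeholders, and the per-service wrappers (HiveAPIClient, ImpalaAPIClient, HbaseAPIClient, HdfsAPIClient and the api.*APIClient modules loaded via importlib) are assumed to live alongside this class:

client = APIClient.get_api_client('Cluster 1', 'cm-host.example.com', 'admin', 'admin')
if client.has_sentry():
    client.enable_sentry()
if client.has_hbase():
    client.enable_hbase_authorization()
client.enable_kerberos()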
Ejemplo n.º 50
0
def main():
  api = ApiResource('r2341-d5-us01', username='******', password='******')
  # check whether get_all_clusters() returns anything
  what = api.get_all_clusters()
  print "what:", what
Ejemplo n.º 51
0
def do_call(user, password, man_host, man_port, cluster_name, parcel_name,
            parcel_version, parcel_repo, init_pre_dir, init_post_dir):
    api = ApiResource(man_host, man_port, user, password, False,
                      MAN_API_VERSION)
    if not parcel_repo.endswith('/'):
        parcel_repo += '/'
    if re.match(REGEX_VERSION, parcel_version) is None or re.match(
            REGEX_VERSION, parcel_version).group() != parcel_version:
        raise Exception('Parcel [' + parcel_name +
                        '] is qualified by invalid version [' +
                        parcel_version +
                        '] expected to match regular expression [' +
                        REGEX_VERSION + ']')
    if not parcel_repo.endswith(parcel_version + '/'):
        raise Exception('Parcel [' + parcel_name +
                        '] is qualified by invalid version [' +
                        parcel_version + '] when compared with repository [' +
                        parcel_repo + ']')
    cm_config = api.get_cloudera_manager().get_config(view='full')
    repo_config = cm_config['REMOTE_PARCEL_REPO_URLS']
    repo_list = repo_config.value or repo_config.default
    if parcel_repo not in repo_list:
        repo_list += ',' + parcel_repo
        api.get_cloudera_manager().update_config(
            {'REMOTE_PARCEL_REPO_URLS': repo_list})
        time.sleep(
            POLL_SEC
        )  # The parcel synchronize end-point is not exposed via the API, so sleep instead
    cluster_names = []
    if cluster_name is None:
        for cluster in api.get_all_clusters():
            cluster_names.append(cluster.name)
    else:
        cluster_names.append(cluster_name)
    for cluster_name_itr in cluster_names:
        print 'Cluster [DEPLOYMENT] starting ... '
        cluster = api.get_cluster(cluster_name_itr)
        parcel = cluster.get_parcel(parcel_name, parcel_version)
        parcel_already_activated = False
        print 'Parcel [DEPLOYMENT] starting ... '
        if parcel.stage == 'ACTIVATED':
            parcel_already_activated = True
            print 'Parcel [DEPLOYMENT] already deployed'
        else:
            do_parcel_op(cluster, parcel_name, parcel_version, 'DOWNLOAD',
                         'AVAILABLE_REMOTELY', 'DOWNLOADED', 'start_download')
            do_parcel_op(cluster, parcel_name, parcel_version, 'DISTRIBUTE',
                         'DOWNLOADED', 'DISTRIBUTED', 'start_distribution')
            do_parcel_op(cluster, parcel_name, parcel_version, 'ACTIVATE',
                         'DISTRIBUTED', 'ACTIVATED', 'activate')
            parcel = cluster.get_parcel(parcel_name, parcel_version)
            if parcel.stage != 'ACTIVATED':
                raise Exception('Parcel is currently mid-stage [' +
                                parcel.stage +
                                '], please wait for this to complete')
        print 'Parcel [DEPLOYMENT] finished'
        if init_pre_dir is not None and os.path.isdir(init_pre_dir):
            print 'Cluster [PRE_INIT] starting ... '
            for script in glob.glob(init_pre_dir + '/*.sh'):
                subprocess.call([script])
            print 'Cluster [PRE_INIT] finished'
        if not parcel_already_activated:
            print 'Cluster [CONFIG_DEPLOYMENT] starting ... '
            cmd = cluster.deploy_client_config()
            if not cmd.wait(TIMEOUT_SEC).success:
                raise Exception('Failed to deploy client configs')
            print 'Cluster [CONFIG_DEPLOYMENT] finished'
            print 'Cluster [RESTART] starting ... '
            for service in cluster.get_all_services():
                if service.type == 'FLUME':
                    service.restart().wait()
                if service.type == 'HIVE':
                    service.restart().wait()
                if service.type == 'YARN':
                    service.restart().wait()
            print 'Cluster [RESTART] finished'
        if init_post_dir is not None and os.path.isdir(init_post_dir):
            print 'Cluster [POST_INIT] starting ... '
            for script in glob.glob(init_post_dir + '/*.sh'):
                subprocess.call([script])
            print 'Cluster [POST_INIT] finished'
        print 'Cluster [DEPLOYMENT] finished'
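
The function above relies on a do_parcel_op helper that is not shown. The following is a hedged sketch of what such a helper might look like, built only from documented cm_api parcel calls (cluster.get_parcel plus the parcel commands named in the calls above, e.g. start_download) and the POLL_SEC constant and time module the snippet already assumes; it is an illustration, not the original helper:

def do_parcel_op(cluster, parcel_name, parcel_version, label,
                 stage_before, stage_after, command_name):
    parcel = cluster.get_parcel(parcel_name, parcel_version)
    if parcel.stage != stage_before:
        return  # nothing to do from this stage
    print 'Parcel [' + label + '] starting ... '
    getattr(parcel, command_name)()  # e.g. parcel.start_download()
    while True:
        parcel = cluster.get_parcel(parcel_name, parcel_version)
        if parcel.stage == stage_after:
            break
        if parcel.state.errors:
            raise Exception(str(parcel.state.errors))
        time.sleep(POLL_SEC)
    print 'Parcel [' + label + '] finished'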
  'AD_KDC_DOMAIN':        kerberos_ad_ou,
  'KDC_HOST':             kerberos_ad_server,
  'KDC_TYPE':             'Active Directory',
  'KRB_MANAGE_KRB5_CONF': 'true',
  'KRB_ENC_TYPES':        'aes256-cts',
  'SECURITY_REALM':       kerberos_ad_realm
})

print 'Import KDC credentials'
cmd = cm.import_admin_credentials(kerberos_cm_principal, krb_pwd).wait()
if not cmd.success:
  raise Exception('Command %s failed (%s)' % (cmd.name, cmd.resultMessage))

print 'Configure Kerberos for cluster services'
if api_version >= 11:
  for cluster in api.get_all_clusters():
    cmd = cluster.configure_for_kerberos(1004, 1006).wait()
    if not cmd.success:
      raise Exception('Command %s failed (%s)' % (cmd.name, cmd.resultMessage))
else:
  CFG = yaml.load('''
  ZOOKEEPER:
    config:
      enableSecurity: true
  HDFS:
    config:
      hadoop_security_authentication: kerberos
      hadoop_security_authorization: true
    roleConfigGroups:
      DATANODE:
        dfs_datanode_data_dir_perm: 700