def get_cluster_scheduler(self):
    """
    Collect scheduler (queue) information from the Hadoop cluster and write it
    to scheduler.csv (appended) and scheduler2.csv (overwritten) under self.file_path.
    """
    url = self.hadoop_url + "scheduler"
    scheduler_file = os.path.join(self.file_path, "scheduler.csv")
    scheduler_file2 = os.path.join(self.file_path, "scheduler2.csv")
    try:
        results = urlopen(url, timeout=2000).read()
        results = json.loads(results)
        results = results['scheduler']['schedulerInfo']['queues']['queue']
        logger.debug(self.memcpu_info)
        for scheduler_info in results:
            results_copy = scheduler_info.copy()
            # Normalize each queue's used resources by the cluster totals
            # cached in self.memcpu_info (memory / vCores).
            for key, value in results_copy['resourcesUsed'].items():
                scheduler_info[key] = value / self.memcpu_info[key]
    except KeyError as error:
        logger.error("key error {0}".format(error))
        return
    except Exception as error:
        logger.error(error)
        return
    write_header = True
    if FileOperator.file_exits(scheduler_file):
        write_header = False
    headers = results[0].keys()
    FileOperator.write_to_csv(results, scheduler_file, headers=headers,
                              write_header=write_header, model="a+")
    FileOperator.write_to_csv(results, scheduler_file2, headers=headers,
                              write_header=write_header, model="w+")
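# Illustrative sketch (not part of the original code) of the response shape the
# method above assumes from the ResourceManager "scheduler" endpoint; the
# values are made up:
#
#   {"scheduler": {"schedulerInfo": {"queues": {"queue": [
#       {"queueName": "default",
#        "resourcesUsed": {"memory": 2048, "vCores": 2},
#        ...}]}}}}
#
# Each queue's "memory" and "vCores" usage is divided by the cluster totals in
# self.memcpu_info, yielding a per-queue utilization ratio between 0 and 1.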
def get_cluster_information(self):
    """
    Collect cluster-level metrics and write them to cluster.csv (appended)
    and cluster2.csv (overwritten), then refresh the scheduler statistics.
    """
    url = self.hadoop_url + "metrics"
    write_header = True
    cluster_file = os.path.join(self.file_path, "cluster.csv")
    cluster_file2 = os.path.join(self.file_path, "cluster2.csv")
    if FileOperator.file_exits(cluster_file):
        write_header = False
    try:
        results = urlopen(url, timeout=2000).read()
        results = [json.loads(results)["clusterMetrics"]]
    except Exception as error:
        logger.error(error)
        return
    # Cache the cluster totals so queue usage can be normalized against them.
    self.memcpu_info["memory"] = results[0].get('totalMB', 0)
    self.memcpu_info["vCores"] = results[0].get('totalVirtualCores', 0)
    self.get_cluster_scheduler()
    headers = results[0].keys()
    FileOperator.write_to_csv(results, cluster_file, headers=headers,
                              write_header=write_header, model="a+")
    FileOperator.write_to_csv(results, cluster_file2, headers=headers, model="w")
def get_sparkjobs_information(self, applications):
    """
    Collect job-level information for each Spark application.
    :param applications: list of application description dicts
    """
    app_jobs = []
    spark_job_file = os.path.join(self.file_path, "sparkjob.json")
    # job_metrics is a comma-separated string of the job fields to keep.
    self.job_metrics = self.job_metrics.replace("\n", "").split(',')
    for application_items in applications:
        application_id = application_items["id"]
        application_rest_url = self.application_url + application_id + "/1/jobs"
        try:
            application_jobs_list = HadoopUtil.request_url(application_rest_url)
            application_jobs_list = json.loads(application_jobs_list)
        except urlerror:
            logger.warning("this application {0} is not "
                           "a spark type application".format(application_id))
        else:
            for application_job in application_jobs_list:
                # Keep only the job fields listed in job_metrics and merge
                # them with the parent application's fields.
                apps = {key: value for key, value in application_job.items()
                        if key in self.job_metrics}
                app_jobs.append(dict(apps, **application_items))
    if not app_jobs:
        return
    headers = app_jobs[0].keys()
    FileOperator.write_to_json(app_jobs, spark_job_file)
    FileOperator.write_to_csv(app_jobs, spark_job_file, headers=headers)
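# Illustrative only: job_metrics is expected to look like "jobId,name,status"
# (fields of the Spark REST endpoint /applications/<app-id>/<attempt>/jobs),
# and a resulting record merges those fields with the application dict, e.g.
#
#   {"jobId": 3, "name": "count at ...", "status": "SUCCEEDED",
#    "id": "application_1500000000000_0001", "queue": "default", ...}
#
# Any field names beyond "id" and "queue", which the surrounding code reads,
# are assumptions about the cluster's REST responses.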
def get_scheduler_info(self, running_application):
    """
    Summarize allocated memory/vCores per queue from the running-applications
    DataFrame, then dump the raw scheduler metrics for each queue.
    """
    logger.info("start get_scheduler_info")
    apps = running_application.copy(deep=True)
    apps = apps.groupby('queue')[['allocatedMB', 'allocatedVCores']].sum()
    apps['queueName'] = apps.index
    apps.insert(0, 'totalMemory', self.memcpu_info['memory'])
    apps.insert(0, 'totalCpu', self.memcpu_info['vCores'])
    # Per-queue utilization as a fraction of the whole cluster.
    apps.insert(0, 'memory', apps['allocatedMB'] / apps['totalMemory'])
    apps.insert(0, 'vCores', apps['allocatedVCores'] / apps['totalCpu'])
    scheduler_file = os.path.join(self.file_path, "scheduler_summary.csv")
    write_header = True
    if FileOperator.file_exits(scheduler_file):
        write_header = False
    apps.to_csv(scheduler_file, header=write_header, index=False, mode="a+")
    logger.info("start get_cluster_scheduler")
    url = self.hadoop_url + "scheduler"
    scheduler_file2 = os.path.join(self.file_path, "scheduler_metric.csv")
    results = urlopen(url, timeout=2000).read()
    results = json.loads(results)
    results = results['scheduler']['schedulerInfo']['queues']['queue']
    # Drop the nested child-queue structure so each row stays flat.
    for queue_info in results:
        if 'queues' in queue_info:
            del queue_info['queues']
    headers = results[0].keys()
    FileOperator.write_to_csv(results, scheduler_file2, headers=headers, model="w+")
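# Minimal sketch of what the groupby/normalization above produces; the numbers
# and queue name are made up. With totalMB=8192, totalVirtualCores=8 and two
# running apps in queue "default" using 2048 MB / 2 vCores each:
#
#   allocatedMB sum = 4096, allocatedVCores sum = 4
#   memory = 4096 / 8192 = 0.5, vCores = 4 / 8 = 0.5
#
# i.e. one summary row per queue giving its share of the cluster.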
def get_applications_information(self, query_parametes=None):
    """
    Collect application information from the ResourceManager and write it to app.csv.

    :param query_parametes: dict of filter conditions; by default (None) all
        applications are collected. Supported keys:
        * state [deprecated] - state of the application
        * states - applications matching the given application states,
          specified as a comma-separated list.
        * finalStatus - the final status of the application - reported by the
          application itself
        * user - user name
        * queue - queue name
        * limit - total number of app objects to be returned
        * startedTimeBegin - applications with start time beginning with this
          time, specified in ms since epoch
        * startedTimeEnd - applications with start time ending with this time,
          specified in ms since epoch
        * finishedTimeBegin - applications with finish time beginning with this
          time, specified in ms since epoch
        * finishedTimeEnd - applications with finish time ending with this
          time, specified in ms since epoch
        * applicationTypes - applications matching the given application types,
          specified as a comma-separated list.
        * applicationTags - applications matching any of the given application
          tags, specified as a comma-separated list.

    example:
        query_parametes = {"finalStatus": "SUCCEEDED"}
        get_applications_information(query_parametes=query_parametes)
    """
    hadoop_rest_url = self.hadoop_url + "apps?"
    app_file = os.path.join(self.file_path, "app.csv")
    try:
        for key, value in query_parametes.items():
            hadoop_rest_url += key + "=" + str(value) + "&"
    except AttributeError:
        logger.warn("did not get any query_parametes, so collect all apps")
    json_result = HadoopUtil.request_url(hadoop_rest_url)
    try:
        list_result = json.loads(json_result)['apps']['app']
        headers = list_result[0].keys()
    except KeyError as error:
        logger.error("key error {0}".format(error))
    except TypeError:
        logger.warn("did not get any data from parameters "
                    "{0}".format(query_parametes))
    except Exception as error:
        logger.error(error)
    else:
        FileOperator.write_to_csv(list_result, app_file, headers=headers)
        self.get_sparkjobs_information(list_result)
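# Illustrative usage (the instance name "collector" is hypothetical; the
# parameter values are examples of the filters documented above):
#
#   collector.get_applications_information(
#       query_parametes={"states": "FINISHED", "finalStatus": "SUCCEEDED",
#                        "limit": 100})
#
# Each matching application is written to app.csv and then handed to
# get_sparkjobs_information for per-job details.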
def get_commonjobs_information(self):
    """ Collect common (non-Spark) job information and write it to commonjob.csv. """
    commonjob_file = os.path.join(self.file_path, "commonjob.csv")
    result = HadoopUtil.request_url(self.job_url)
    result = json.loads(result)["jobs"]
    if not result:
        return
    result = result["job"]
    headers = result[0].keys()
    FileOperator.write_to_csv(result, commonjob_file, headers=headers)
def get_cluster_information(self):
    """
    Collect cluster-level metrics, cache the cluster totals, and then collect
    application information.
    """
    logger.info("start get_cluster_information")
    url = self.hadoop_url + "metrics"
    write_header = True
    cluster_file = os.path.join(self.file_path, "cluster.csv")
    if FileOperator.file_exits(cluster_file):
        write_header = False
    results = urlopen(url, timeout=2000).read()
    results = [json.loads(results)["clusterMetrics"]]
    # Cache the cluster totals used to normalize per-queue usage.
    self.memcpu_info["memory"] = results[0].get('totalMB', 0)
    self.memcpu_info["vCores"] = results[0].get('totalVirtualCores', 0)
    headers = results[0].keys()
    FileOperator.write_to_csv(results, cluster_file, headers=headers,
                              write_header=write_header, model="a+")
    self.get_applications_information()
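# Illustrative entry point for one collection cycle (the class name and
# constructor arguments are not shown in this excerpt, so everything below is
# an assumption; self.hadoop_url must be the ResourceManager REST base that
# "metrics", "scheduler" and "apps?" are appended to):
#
#   monitor = HadoopMonitor(hadoop_url="http://rm-host:8088/ws/v1/cluster/",
#                           file_path="/tmp/hadoop-metrics")
#   monitor.get_cluster_information()   # also triggers apps + Spark job collection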