def alarm_load():
    """Restart (or alert on) services on servers whose CPU load stays high.

    Flow:
      1. Pull hosts from the ``zabbix_info`` table with ``cpu_load > 100``
         that still answer ICMP ping, refreshed today.
      2. Confirm the overload against the last 10 minutes of InfluxDB
         ``server_infos`` means, grouped by hostname.
      3. For each confirmed host (unless whitelisted in redis), locate the
         top-CPU process; restart it via supervisorctl when it maps to a
         known tomcat project, otherwise send a DingTalk warning.

    Side effects only (zabbix/Influx/redis/SSH/DingTalk); removes the
    SQLAlchemy sessions on exit. Returns ``None``.
    """
    try:
        loging.write("start %s ......" % alarm_load.__name__)
        whitelist = []
        # hostname -> (ip, ssh_port, mean cpu load); the original used
        # defaultdict() with no factory, which is just a plain dict.
        dict_load = {}
        db_server = db_idc.idc_servers
        db_zabbix = db_idc.zabbix_info
        db_project = db_op.project_list
        db_project_other = db_op.project_other
        Influx_cli = InfluxDBClient(influxdb_host, influxdb_port, influxdb_user,
                                    influxdb_pw, 'zabbix_infos')
        host_infos = db_zabbix.query.with_entities(
            db_zabbix.ip, db_zabbix.ssh_port, db_zabbix.hostname,
            db_zabbix.update_time).filter(
            and_(db_zabbix.cpu_load > 100, db_zabbix.icmpping == 1)).all()
        Key = "op_alarm_load_whitelist"
        if RC_CLUSTER.exists(Key):
            whitelist = RC_CLUSTER.smembers(Key)
        # Loop over suspect servers and confirm the load against InfluxDB.
        for infos in host_infos:
            host, ssh_port, hostname, update_time = infos
            # only consider zabbix records refreshed today
            if time.strftime('%Y-%m-%d', time.localtime()) in update_time:
                try:
                    if not host.startswith('172.16.19.'):
                        now_time = datetime.datetime.now()
                        dt = now_time - datetime.timedelta(minutes=10)
                        dt = dt.strftime('%Y-%m-%dT%H:%M:%SZ')
                        cmd = "select mean(*) from server_infos where time >='%s' group by hostname" % dt
                        results = Influx_cli.query(cmd)
                        if results:
                            for key in results.keys():
                                if hostname == key[-1]['hostname']:
                                    for infos in results[key]:
                                        if infos['mean_cpu_load'] > 100:
                                            dict_load[hostname] = (host, ssh_port, int(infos['mean_cpu_load']))
                except Exception as e:
                    logging.error(e)
                    continue
        # Restart phase for confirmed hosts.
        if dict_load:
            for hostname in dict_load:
                host, ssh_port, cpu_load = dict_load[hostname]
                # BUGFIX: `text` was referenced before assignment on several
                # paths (NameError), and the local name `ops_token` shadowed
                # the module-level DingTalk token, breaking the branches that
                # read it before assignment. Initialise both per host.
                text = None
                msg_token = None
                # check whether ssh login works
                try:
                    Ssh = SSH.ssh(ip=host, ssh_port=ssh_port)
                except Exception as e:
                    logging.error(e)
                    if not hostname.startswith('nj'):
                        Ssh_Key = "op_ssh_login_fail_%s" % hostname
                        RC.incr(Ssh_Key, 1)
                        RC.expire(Ssh_Key, 350)
                        # BUGFIX: `text` was undefined here; build an alert body.
                        text = ['**线上服务器预警:%s**' % hostname,
                                "CPU平均使用率:{0}%".format(cpu_load),
                                '**ssh登录失败,需手动处理!**']
                        if int(RC.get(Ssh_Key)) > 5:
                            # escalate with the ops token after repeated failures
                            # NOTE(review): assumes a module-level ops_token exists — confirm.
                            tools.dingding_msg(text, token=ops_token)
                        else:
                            tools.dingding_msg(text)
                else:
                    try:
                        Key = 'op_alarm_load_%s' % hostname
                        Project = None
                        RC_CLUSTER.incr(Key, 5)
                        RC_CLUSTER.expire(Key, 600)
                        # minutes of sustained load, accumulated in redis (5 per pass)
                        ctime = int(RC_CLUSTER.get(Key))
                        if hostname not in whitelist:
                            # find the process currently eating the most CPU
                            results = Ssh.Run("ps -aux | sort -k3nr |head -n 1")
                            if results['stdout']:
                                results = results['stdout'][0].strip().split()
                                try:
                                    if results[-1].endswith('-rpc.jar'):
                                        pro_jar = results[-1]
                                        # only this jar is considered restart-safe
                                        if pro_jar in ['moji-location-rpc.jar']:
                                            Project = pro_jar.split('.')[0]
                                    else:
                                        for line in results:
                                            if '-Dcatalina.home=' in line:
                                                Project = line.strip().split('/')[-1]
                                                break
                                except Exception as e:
                                    logging.error(e)
                            if Project:
                                try:
                                    # is it a managed tomcat project?
                                    ret = db_project.query.filter(
                                        and_(db_project.ip == host,
                                             db_project.ssh_port == ssh_port)).all()
                                    if ret:
                                        # restart the problem tomcat
                                        result = Ssh.Run("supervisorctl restart {0}".format(Project))
                                        if result['stderr']:
                                            text = ['**线上服务重启:%s**' % hostname,
                                                    "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                    "相关进程:{0}".format(Project),
                                                    '**服务重启失败,需手动处理!**']
                                        else:
                                            text = ['**线上服务重启:%s**' % hostname,
                                                    "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                    "相关进程:{0}".format(Project),
                                                    '**服务重启成功!**']
                                        # tomcat notices go to the default token
                                        msg_token = None
                                    else:
                                        # is it a known jar project?
                                        server_id = db_server.query.with_entities(db_server.id).filter(
                                            db_server.hostname == hostname).all()
                                        if server_id[0]:
                                            ret = db_project_other.query.filter(
                                                db_project_other.server_id == int(server_id[0][0])).all()
                                            if ret:
                                                text = ['**线上服务器预警:%s**' % hostname,
                                                        "CPU持续{0}分钟平均使用率:{1}%".format(ctime, cpu_load),
                                                        "相关进程:{0}".format(Project),
                                                        '**请及时进行处理!**']
                                                # BUGFIX: the token was undefined on this path;
                                                # escalate jar warnings with the ops token.
                                                # NOTE(review): assumes module-level ops_token — confirm.
                                                msg_token = ops_token
                                    if text and not hostname.startswith('nj'):
                                        tools.dingding_msg(text, msg_token)
                                except Exception as e:
                                    logging.error(e)
                    finally:
                        Ssh.Close()
    finally:
        loging.write("%s complete!" % alarm_load.__name__)
        db_idc.DB.session.remove()
        db_op.DB.session.remove()
def get_other_info():
    """Collect crontab, running-jar and /etc/hosts inventory from every
    active server and mirror it into the database.

    Servers with status '维护中' or comment '跳过' are excluded; servers whose
    ssh port fails a 3s tcpping, or whose ssh login fails, are skipped.
    For each reachable server the previously stored rows are replaced with
    freshly collected ones. Side effects only; returns ``None``.
    """
    db_project_other = db_op.project_other
    db_crontabs = db_idc.crontabs
    db_servers = db_idc.idc_servers
    db_hosts = db_idc.hosts
    infos = db_servers.query.with_entities(
        db_servers.id, db_servers.ip, db_servers.ssh_port).filter(
        and_(db_servers.status != '维护中', db_servers.comment != '跳过')).all()
    try:
        for info in infos:
            server_id, ip, ssh_port = info
            if tcpping(host=ip, port=ssh_port, timeout=3):
                try:
                    Ssh = SSH.ssh(ip=ip, ssh_port=ssh_port)
                except:
                    continue
                else:
                    try:
                        update_date = time.strftime('%Y-%m-%d', time.localtime())
                        # --- collect crontab info ---
                        cron_users = Ssh.Run("ls /var/spool/cron/")
                        if cron_users['stdout']:
                            # BUGFIX: the old rows used to be deleted once per
                            # *user* inside the loop below, so on servers with
                            # several cron users each user wiped the rows just
                            # inserted for the previous one. Delete once per server.
                            for c in db_crontabs.query.filter(
                                    db_crontabs.server_id == int(server_id)).all():
                                db_idc.DB.session.delete(c)
                                db_idc.DB.session.commit()
                            for user in cron_users['stdout']:
                                user = user.strip()
                                # renamed from `results`, which shadowed the list being iterated
                                crontab = Ssh.Run("cat /var/spool/cron/%s" % user)
                                if crontab['stdout']:
                                    for result in crontab['stdout']:
                                        # keep only real schedule lines
                                        if not result.startswith('#') and '*' in result:
                                            result = result.strip().split()
                                            cron = ' '.join(result[:5])
                                            action = ' '.join(result[5:])
                                            c = db_crontabs(cron=cron, user=user, action=action,
                                                            server_id=int(server_id),
                                                            update_time=update_date)
                                            db_idc.DB.session.add(c)
                                            db_idc.DB.session.commit()
                    except Exception as e:
                        logging.error(e)
                    # --- collect running jar info ---
                    try:
                        results = Ssh.Run("ps -ef|grep java|grep -e '.jar$'")
                        if results['stdout']:
                            vals = []
                            v = db_project_other.query.filter(
                                db_project_other.server_id == int(server_id)).all()
                            for c in v:
                                db_op.DB.session.delete(c)
                                db_op.DB.session.commit()
                            for result in results['stdout']:
                                if 'hadoop' not in result and 'hive' not in result:
                                    # last ps column is the jar path; keep the basename
                                    result = result.strip().split()[-1]
                                    if '/' in result:
                                        result = result.split('/')[-1]
                                    vals.append(result)
                            for val in set(vals):
                                result = db_project_other.query.filter(
                                    and_(db_project_other.project == val,
                                         db_project_other.server_id == server_id)).all()
                                if not result:
                                    business_id = 0
                                    # inherit business_id from the same project on any server
                                    business = db_project_other.query.with_entities(
                                        db_project_other.business_id).filter(
                                        and_(db_project_other.project == val,
                                             db_project_other.business_id != 0)).all()
                                    if business:
                                        business_id = business[0][0]
                                    c = db_project_other(lable='java', project=val,
                                                         server_id=server_id,
                                                         business_id=business_id,
                                                         update_time=update_date)
                                    db_op.DB.session.add(c)
                                    db_op.DB.session.commit()
                    except Exception as e:
                        logging.error(e)
                    # --- collect /etc/hosts info ---
                    try:
                        results = Ssh.Run("cat /etc/hosts")
                        if results['stdout']:
                            v = db_hosts.query.filter(
                                db_hosts.server_id == int(server_id)).all()
                            for c in v:
                                db_idc.DB.session.delete(c)
                                db_idc.DB.session.commit()
                            for line in results['stdout']:
                                if not line.startswith('#') and '127.0.0.1' not in line:
                                    line = line.strip().split()
                                    if line:
                                        if len(line) == 2:
                                            if 'localhost' not in line[1]:
                                                c = db_hosts(host=line[0], hostname=line[1],
                                                             server_id=server_id,
                                                             update_time=update_date)
                                                db_idc.DB.session.add(c)
                                                db_idc.DB.session.commit()
                                        if len(line) > 2:
                                            # one row per alias on multi-name lines
                                            for hostname in line[1:]:
                                                if not hostname.startswith('#') and not 'localhost' in hostname:
                                                    c = db_hosts(host=line[0], hostname=hostname,
                                                                 server_id=server_id,
                                                                 update_time=update_date)
                                                    db_idc.DB.session.add(c)
                                                    db_idc.DB.session.commit()
                    except Exception as e:
                        logging.error(e)
                    Ssh.Close()
    except Exception as e:
        logging.error(e)
    finally:
        db_idc.DB.session.remove()
        db_op.DB.session.remove()
def Redis_info(info):
    # Probe one redis endpoint and sync its metadata into db_redis.
    #
    # info: tuple (ip, ssh_port, app_port). If the redis port is reachable,
    # find the redis pid over ssh, locate its .conf to read masterauth /
    # requirepass, query INFO for the role, then insert/update db_redis rows
    # for this instance and its online slaves. If the port is unreachable,
    # the stale db_redis / db_third rows are deleted instead.
    # NOTE(review): relies on module-level ssh_ports / server_ids maps and
    # db_servers / db_third / db_redis / update_date globals — confirm callers.
    ip, ssh_port, app_port = info
    # initialise parameters
    masterauth = None
    requirepass = None
    pid = None
    conf_dir = None
    conf_file = ""
    redis_type = {'master': '否', 'slave': '否', 'cluster': '否'}
    # check reachability (NOTE(review): original comment said "ssh port" but
    # this pings app_port, the redis port — confirm which is intended)
    if tcpping(host=ip, port=app_port, timeout=3):
        try:
            Ssh = SSH.ssh(ip=ip, ssh_port=ssh_port)
        except:
            # ssh login failed: silently skip this host
            pass
        else:
            # find the pid listening on the redis port
            cmd = "netstat -lntp|grep :%s" % app_port
            results = Ssh.Run(cmd)
            if results['stdout']:
                for line in results['stdout'][0].split():
                    if '/redis' in line:
                        # netstat prints "pid/progname"
                        pid = line.split('/')[0]
                        break
            if pid:
                cmd = "/bin/ps -ef|grep -v grep|grep {}".format(pid)
                results = Ssh.Run(cmd)
                if results['stdout']:
                    result = results['stdout'][0]
                    if 'cluster' in result:
                        # cluster node: no conf parsing / role probing below
                        redis_type['cluster'] = '是'
                    else:
                        try:
                            # last ps column is the conf path the server started with
                            result = results['stdout'][0].split()[-1]
                            if '/' in result:
                                conf_file = "/usr/local/moji/redis/etc/{}".format(result.split('/')[-1])
                            if not conf_file.endswith('.conf'):
                                # fall back: search the process cwd for a conf
                                # whose "port" line matches app_port
                                cmd = "lsof -p {}|grep 'cwd'".format(pid)
                                cwd = Ssh.Run(cmd)
                                if cwd['stdout']:
                                    for line in cwd['stdout']:
                                        if 'redis' in line:
                                            conf_dir = line.split()[-1]
                                            break
                                if conf_dir:
                                    cmd = "grep {0} -r {1}/|grep '.conf:port'".format(app_port, conf_dir)
                                    results = Ssh.Run(cmd)
                                    if results['stdout']:
                                        for line in results['stdout']:
                                            if ':port {}'.format(app_port) in line:
                                                conf_file = line.split(':')[0]
                            if conf_file.endswith('.conf'):
                                # pull auth settings out of the conf
                                cmd = "grep masterauth {}".format(conf_file)
                                results = Ssh.Run(cmd)
                                if results['stdout']:
                                    masterauth = results['stdout'][0].split()[-1].strip()
                                cmd = "grep requirepass {}".format(conf_file)
                                pw_result = Ssh.Run(cmd)
                                if pw_result['stdout']:
                                    requirepass = pw_result['stdout'][0].split()[-1].strip()
                            # connect (with password when configured) and read the role
                            RC = redis.StrictRedis(ip, int(app_port), decode_responses=True)
                            if requirepass:
                                RC = redis.StrictRedis(ip, int(app_port), password=requirepass, decode_responses=True)
                            Infos = RC.info()
                            if Infos['role'] == 'master':
                                redis_type['master'] = '是'
                            if Infos['role'] == 'slave':
                                redis_type['slave'] = '是'
                            counts = int((Infos['connected_slaves']))
                        except:
                            # probing failed: skip the DB update entirely
                            pass
                        else:
                            try:
                                # insert/update slave records
                                if counts > 0:
                                    for i in range(counts):
                                        # INFO exposes slaves as slave0..slaveN-1
                                        Info = Infos['slave%s' % i]
                                        if isinstance(Info, dict):
                                            slave_ip = Info['ip']
                                            slave_port = Info['port']
                                            slave_status = Info['state']
                                        else:
                                            # older redis-py returns "ip,port,state"
                                            slave_ip, slave_port, slave_status = Info.split(',')
                                        if slave_status == 'online' and int(slave_port) > 1024:
                                            try:
                                                SSH_port = ssh_ports['%s:%s' % (slave_ip, slave_port)]
                                                server_id = server_ids['%s:%s' % (slave_ip, SSH_port)]
                                            except:
                                                # fall back: resolve via secondary-ip match
                                                server_id = slave_ip
                                                servers = db_servers.query.with_entities(db_servers.ip).filter(
                                                    db_servers.s_ip.like('%{0};%'.format(slave_ip))).all()
                                                if servers:
                                                    for server in servers:
                                                        val = db_third.query.filter(
                                                            and_(db_third.ip == server[0],
                                                                 db_third.app_port == slave_port)).all()
                                                        if val:
                                                            SSH_port = ssh_ports['%s:%s' % (server[0], slave_port)]
                                                            server_id = server_ids['%s:%s' % (server[0], SSH_port)]
                                                            break
                                            try:
                                                master_id = server_ids['%s:%s' % (ip, ssh_port)]
                                            except:
                                                # unknown server: store the raw ip instead
                                                master_id = ip
                                            val = db_redis.query.filter(
                                                and_(db_redis.server_id == server_id,
                                                     db_redis.port == slave_port)).all()
                                            if val:
                                                db_redis.query.filter(
                                                    and_(db_redis.server_id == server_id,
                                                         db_redis.port == slave_port)).update(
                                                    {db_redis.masterauth: masterauth,
                                                     db_redis.requirepass: requirepass,
                                                     db_redis.master: '否', db_redis.slave: '是',
                                                     db_redis.cluster: '否',
                                                     db_redis.Master_Host: master_id,
                                                     db_redis.Master_Port: app_port,
                                                     db_redis.update_date: update_date})
                                                db_idc.DB.session.commit()
                                            else:
                                                # NOTE(review): insert uses Master_host while the
                                                # update above uses Master_Host — confirm the model
                                                # really declares both spellings.
                                                c = db_redis(server_id=server_id, port=slave_port,
                                                             masterauth=masterauth, requirepass=requirepass,
                                                             master='否', slave='是', cluster='否',
                                                             Master_host=master_id, Master_Port=app_port,
                                                             update_date=update_date)
                                                db_idc.DB.session.add(c)
                                                db_idc.DB.session.commit()
                            except:
                                db_idc.DB.session.rollback()
                            try:
                                # insert/update the master or cluster record
                                if redis_type['master'] == '是' or redis_type['cluster'] == '是':
                                    try:
                                        server_id = server_ids['%s:%s' % (ip, ssh_port)]
                                    except:
                                        # fall back: resolve via secondary-ip match
                                        server_id = ip
                                        servers = db_servers.query.with_entities(db_servers.ip).filter(
                                            db_servers.s_ip.like('%{0};%'.format(ip))).all()
                                        if servers:
                                            for server in servers:
                                                val = db_third.query.filter(
                                                    and_(db_third.ip == server[0],
                                                         db_third.app_port == app_port)).all()
                                                if val:
                                                    server_id = server_ids['%s:%s' % (server[0], ssh_port)]
                                                    break
                                    val = db_redis.query.filter(
                                        and_(db_redis.server_id == server_id,
                                             db_redis.port == app_port)).all()
                                    if val:
                                        db_redis.query.filter(
                                            and_(db_redis.server_id == server_id,
                                                 db_redis.port == app_port)).update(
                                            {db_redis.masterauth: masterauth,
                                             db_redis.requirepass: requirepass,
                                             db_redis.master: redis_type['master'],
                                             db_redis.slave: redis_type['slave'],
                                             db_redis.cluster: redis_type['cluster'],
                                             db_redis.Master_Host: '', db_redis.Master_Port: '',
                                             db_redis.update_date: update_date})
                                        db_idc.DB.session.commit()
                                    else:
                                        loging.write("add new redis %s %s ......" % (ip, app_port))
                                        c = db_redis(server_id=server_id, port=app_port,
                                                     masterauth=masterauth, requirepass=requirepass,
                                                     master=redis_type['master'],
                                                     slave=redis_type['slave'],
                                                     cluster=redis_type['cluster'],
                                                     Master_host='', Master_Port='',
                                                     update_date=update_date)
                                        db_idc.DB.session.add(c)
                                        db_idc.DB.session.commit()
                            except:
                                db_idc.DB.session.rollback()
                            finally:
                                Ssh.Close()
    else:
        # redis port unreachable: purge the stale records
        loging.write("delete not exist redis %s %s ......" % (ip, app_port))
        v = db_redis.query.filter(
            and_(db_redis.server_id == server_ids['%s:%s' % (ip, ssh_port)],
                 db_redis.port == app_port)).all()
        for c in v:
            db_idc.DB.session.delete(c)
            db_idc.DB.session.commit()
        v = db_third.query.filter(and_(db_third.ip == ip, db_third.app_port == app_port)).all()
        for c in v:
            db_idc.DB.session.delete(c)
            db_idc.DB.session.commit()