def get_chd_files(self):
    '''
    Collect *.chd stripe files from every node and record the new ones.

    Runs a find for "*.chd" under /data0?/cur on each node of
    self.node_list (through ssn). Every (node, path) pair that is not yet
    in self.file_list is appended to both self.file_list and
    self.new_file_list. self.new_file_list is reset on every call.

    @raise ExecutionFailed: If the find command fails on any node.
    '''
    # Clear list of new stripe files
    self.new_file_list = []
    # Hash lookup of the known entries, so each output line costs O(1)
    # instead of a scan over the whole (growing) file list.
    known = set(self.file_list)
    eval_cmd = 'eval `ssh-agent`;eval `ssh-add ~/.ssh/dpnid`;'
    for node in self.node_list:
        ssn_cmd = 'ssn %s \'find /data0?/cur -name "*.chd"\'' % node
        result = self.ssh.run_command(eval_cmd + ssn_cmd)
        if result[2]:
            LOG.error('Fail to get chd file list on node %s.' % node)
            raise ExecutionFailed('Fail to get chd file on %s.' % node)
        for line in result[0].split('\n'):
            # Ignore other files
            if not line.endswith('.chd'):
                continue
            entry = (node, line)
            # Ignore existing stripe files
            if entry in known:
                continue
            # Add new stripe files into both lists
            known.add(entry)
            self.file_list.append(entry)
            self.new_file_list.append(entry)
def getAvamarType(self):
    """
    Get the Avamar type over the attached ssh connection.

    The physical models are probed first; if none matches, the AVE type is
    read from the "ave type=" line of /usr/local/avamar/etc/node.cfg.
    Valid types: gen4/gen4s/gen4t/vmware ave/kvm ave/hyperv ave/azure ave/aws ave

    @output: String; the detected type. "" when no ssh connection is
             attached, None when nothing matched or a probe raised.
    """
    if not self.__ssh:
        LOG.error("No ssh connection attached")
        return ""
    # Ordered pairs (not a dict) so the probe order is the same on every run
    physical_machine_checks = (
        ("gen4", "cat /proc/cpuinfo | grep -q 5504"),
        ("gen4s", "sudo dmidecode | grep -q S2600G"),
        ("gen4t", "/sbin/div_cli -r 2>/dev/null| grep -q GEN4T"),
    )
    try:
        for physical_type, cmd in physical_machine_checks:
            if self.__ssh.run_command(cmd)[2] == 0:
                return physical_type
        # Not a physical machine, check whether it is an AVE
        ave_type_file = "/usr/local/avamar/etc/node.cfg"
        # Check whether node.cfg exists before reading it
        res = self.__ssh.run_command("ls %s" % ave_type_file)
        if res[2] != 0:
            return None
        res = self.__ssh.run_command("cat %s" % ave_type_file)
        if res[2] != 0:
            return None
        # Try to find the AVE type; the first matching line wins
        for line in res[0].split("\n"):
            matched = re.match(r"ave type=\s*(.*)", line)
            if matched:
                return matched.group(1) + " ave"
    except Exception as e:
        LOG.error(str(e))
    return None
def ssh_run_command_error_quit(ssh, cmd):
    '''
    Run a command over ssh and raise if it fails.

    @ssh: Connection object providing run_command(cmd); element [2] of its
          result is treated as the return code, element [0] is returned.
    @cmd: String; the command to run.
    @output: result[0] of the command.
    @raise Exception: If result[2] is non-zero.
    '''
    result = ssh.run_command(cmd)
    if result[2]:
        LOG.error("cmd failed:%s" % (cmd))
        # Dump the first two result fields to help diagnose the failure
        for i in range(2):
            LOG.error("result[%d] :%s" % (i, result[i]))
        # Call form of raise: valid on both Python 2 and Python 3
        raise Exception("Execute command failed:%s." % (cmd))
    return result[0]
def create_directory(self, directory):
    '''
    Make a directory (including missing parents) on the remote server.
    @directory: Directory path.
    @output: Boolean; True when mkdir exits with 0, otherwise False.
    '''
    return_code = self.server_ssh.run_command('mkdir -p %s' % directory)[2]
    if return_code == 0:
        LOG.info('Created directory: %s' % directory)
        return True
    LOG.error('Failed to created directory: %s' % directory)
    return False
def get_latest_cp_attribute(self, attribute):
    '''
    Get an attribute of the latest check point.
    call command "avmaint cpstatus"
    @attribute: String; the attribute to be searched
    @output: The value of the attribute, or False when the status cannot
             be fetched or the attribute name is unknown
    '''
    try:
        cp = AvCheckPoint(self.ssh)
        cpstatus = cp.cpstatus()
    except Exception as e:
        LOG.error("%s: Failed to get cpstatus:%s" %
                  (self.__classname, str(e)))
        return False
    # The attributes of the status object are looked up by name
    fields = vars(cpstatus)
    if attribute in fields:
        return fields[attribute]
    LOG.error("Invalid attribute name for check point :%s" % (attribute))
    return False
def unlockcp(self, cp_name):
    '''
    Unlock a checkpoint.
    @cp_name: String; the check point to be unlocked.
    @output: Boolean; False when the unlock call raises.
    '''
    try:
        cp = AvCheckPoint(self.ssh)
        # NOTE(review): the value returned by cp.unlockcp() is not used in
        # this span; confirm what callers expect on success.
        result = cp.unlockcp(cp_name)
    except Exception as e:
        # Three values need three placeholders; the previous two-placeholder
        # string raised TypeError inside this handler.
        LOG.error("%s: Failed to unlock check point %s:%s" %
                  (self.__classname, cp_name, str(e)))
        return False
def perf_restore(self):
    '''
    Restore backups from the Avamar server and calculate the restore rate.

    Up to self.count backup sequence numbers are extracted one by one into
    a scratch directory; the elapsed time of each run is taken from its
    avtar log. The totals are stored in self.restore_elapsed,
    self.restore_speed and self.summary['Restore'].

    @output: Boolean; False as soon as one restore fails or its elapsed
             time cannot be read from the log, True otherwise.
    '''
    restore_dir = '/home/admin/data_set/restore'
    durations = []
    sequence_numbers = self.avtar.get_backup_sequence_number(
        self.server, self.root_user, self.root_password, self.account)
    for run_no, sn in enumerate(sequence_numbers[0:self.count], 1):
        # Start every run from an empty target directory
        self.dg.remove_data(restore_dir, force=True)
        Utils.create_directory(self.client_ssh, restore_dir)
        log_file = os.path.join(self.log_dir, 'avtar_restore_%s.log' % sn)
        res = self.avtar.extract_backup(
            self.server,
            self.root_user,
            self.root_password,
            self.account,
            sequence_number=sn,
            target=restore_dir,
            options='--logfile=%s' % log_file)
        if res != 0:
            LOG.error('Restore file failed, sequence number %s.' % sn)
            return False
        elapsed = self.analyze_log(self.client_ssh, log_file)
        if elapsed is None:
            LOG.error('Can not get elapsed time from log file.')
            return False
        speed = self.calculate_speed(self.size, elapsed)
        LOG.info('[%s] Restored %s %s in %s minutes: %s GB/hour.' %
                 (run_no, self.size, self.size_unit, elapsed, speed))
        durations.append(elapsed)

    overall_elapsed = sum(durations)
    overall_speed = self.calculate_speed(self.total_size, overall_elapsed)
    self.restore_elapsed = overall_elapsed
    self.restore_speed = overall_speed
    self.summary['Restore'] = {
        'Type': 'Restore',
        'File Size': self.size_str,
        'File Count': self.count,
        'Total Size': self.total_size_str,
        'Test Time (mins)': overall_elapsed,
        'Rate (GB/Hr)': overall_speed
    }
    LOG.info('Restored %s %s in %s minutes: %s GB/hour.' %
             (self.total_size, self.size_unit, overall_elapsed,
              overall_speed))
    # Leave no restored data behind
    self.dg.remove_data(restore_dir, force=True)
    return True
def chd_dump(self, node, chd_file):
    '''
    Dump the content of one stripe with the chddump script.
    @node: Specifies the node, such as 0.0.
    @chd_file: The file path of chd stripe.
    @output: result[0] of the script on success, False on failure.
    '''
    command = ''.join([
        'eval `ssh-agent`;eval `ssh-add ~/.ssh/dpnid`;',
        "ssn %s '%s %s'" % (node, self.remote_script, chd_file),
    ])
    outcome = self.ssh.run_command(command)
    if not outcome[2]:
        return outcome[0]
    LOG.error('Run chddump failed.')
    return False
def remove_checkpoint(self, cp_name, force=False):
    '''
    Remove checkpoint, call command
    avmaint --ava rmcp [ --risklosingallbackups ]
    @cp_name: String; the check point name to be removed
    @force: Bool; whether to remove the check point forcibly
    @output: Bool; False when the removal call raises.
    '''
    try:
        cp = AvCheckPoint(self.ssh)
        # NOTE(review): the value returned by cp.remove_checkpoint() is not
        # used in this span; confirm what callers expect on success.
        result = cp.remove_checkpoint(cp_name, force)
    except Exception as e:
        # The previous string had one placeholder for two values and raised
        # TypeError inside this handler.
        LOG.error("%s: Failed to remove check point %s:%s" %
                  (self.__classname, cp_name, str(e)))
        return False
def _get_all_status(self):
    '''
    Private function. Get the information of all check points.
    call command "avmaint lscp"
    @output: False when listing the check points raises.
    '''
    self.__cpobjs = []
    try:
        cp = AvCheckPoint(self.ssh)
        # NOTE(review): the listing is not used further in this span;
        # confirm how it is meant to reach self.__cpobjs / the caller.
        cpobjs = cp.lscp()
    except Exception as e:
        # The previous string had one placeholder for two values and raised
        # TypeError inside this handler.
        LOG.error("%s: Failed to get all check points:%s" %
                  (self.__classname, str(e)))
        return False
def wipe_workdir(self, directory):
    '''
    Give the remote server a fresh, empty work directory.

    The current directory is renamed with a "_<self.ts>" suffix (nothing
    is deleted) and a new one is created under the original path.
    @directory: Directory path.
    @output: Boolean; True when the command exits with 0.
    '''
    cmd = 'mv %s %s_%s; mkdir -p %s' % (directory, directory, self.ts,
                                        directory)
    return_code = self.server_ssh.run_command(cmd)[2]
    if return_code == 0:
        LOG.info('Wiped work directory: %s' % directory)
        return True
    LOG.error('Failed to wipe work directory: %s' % directory)
    return False
def get_memory_total(self):
    '''
    Read MemTotal from /proc/meminfo and return it divided by 1024 twice
    (i.e. converted to GB, given that the value is reported in kB).
    @output: Float; the total memory size, or -1 when the command fails.
    '''
    outcome = self.ssh.run_command(
        "cat /proc/meminfo|grep MemTotal|awk '{print $2}'")
    if not outcome[2]:
        size = float(outcome[0]) / 1024 / 1024
        LOG.info('Total memory size: %s' % size)
        return size
    LOG.error('Failed with return code:%s, stdout:[%s], stderr:[%s]' %
              (outcome[2], outcome[0], outcome[1]))
    return -1
def get_conf(self):
    '''
    Return the content of akm.xml.

    The file must consist of a single <AkmConfig .../> element without
    child nodes; its attributes are stored in self.akm_conf and returned.
    self.conf_path is set to the path of the file.

    @output: Dict; the attributes of the AkmConfig element.
    @raise Exception: If the root tag is not AkmConfig or it has children.
    '''
    self.conf_path = '/usr/local/avamar/etc/akm/akm.xml'
    result = self.ssh.run_command('cat %s' % self.conf_path)
    root = ET.fromstring(result[0].strip())
    if root.tag != 'AkmConfig':
        LOG.error('The root tag name is %s, not AkmConfig.' % root.tag)
        raise Exception('Wrong format of akm.xml.')
    # len(element) is the number of children; Element.getchildren() no
    # longer exists on Python 3.9+.
    if len(root):
        LOG.error('The AkmConfig node has child nodes.')
        raise Exception('Wrong format of akm.xml.')
    self.akm_conf = root.attrib
    return self.akm_conf
def perf_backup(self):
    '''
    Back up generated data to the Avamar server and calculate the rate.

    self.count files of self.size self.size_unit are generated and backed
    up one by one; the elapsed time of each run is taken from its avtar
    log. The totals are stored in self.backup_elapsed, self.backup_speed
    and self.summary['Backup'].

    @output: Boolean; False as soon as one backup fails or its elapsed
             time cannot be read from the log, True otherwise.
    '''
    data_set = '/home/admin/data_set/backup'
    durations = []
    for index in range(self.count):
        log_name = 'avtar_backup_%s%s_%s.log' % (self.size, self.size_unit,
                                                 index)
        log_file = os.path.join(self.log_dir, log_name)
        options = '--logfile=%s --nocache=true' % log_file
        if self.ddr:
            options += ' --ddr --ddr-index=%s' % self.ddr_index

        self.dg.generate_file(data_set, self.size, self.size_unit, 1)
        res = self.avtar.create_backup(
            self.server,
            self.root_user,
            self.root_password,
            self.account,
            data_set,
            options=options)
        # The generated data is removed whether or not the backup worked
        self.dg.remove_data(data_set)
        if res != 0:
            LOG.error('Backup file failed.')
            return False

        elapsed = self.analyze_log(self.client_ssh, log_file)
        if elapsed is None:
            LOG.error('Can not get elapsed time from log file.')
            return False
        speed = self.calculate_speed(self.size, elapsed)
        LOG.info('[%s] Backed-up %s %s in %s minutes: %s GB/hour.' %
                 (index + 1, self.size, self.size_unit, elapsed, speed))
        durations.append(elapsed)

    overall_elapsed = sum(durations)
    overall_speed = self.calculate_speed(self.total_size, overall_elapsed)
    self.backup_elapsed = overall_elapsed
    self.backup_speed = overall_speed
    self.summary['Backup'] = {
        'Type': 'Backup',
        'File Size': self.size_str,
        'File Count': self.count,
        'Total Size': self.total_size_str,
        'Test Time (mins)': overall_elapsed,
        'Rate (GB/Hr)': overall_speed
    }
    LOG.info('Backed-up %s %s in %s minutes: %s GB/hour.' %
             (self.total_size, self.size_unit, overall_elapsed,
              overall_speed))
    return True
class AvCP(AvServer):
    '''
    Class wrapping checkpoint related commands
    '''

    def __init__(self, ssh=None):
        '''
        @ssh: The ssh connection handed to AvServer and AvCheckPoint.
        '''
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as AvCP is subclassed.
        super(AvCP, self).__init__(ssh)
        self.ssh = ssh
        self.__classname = self.__class__.__name__

    def get_latest_cp_attribute(self, attribute):
        '''
        Get an attribute of the latest check point.
        call command "avmaint cpstatus"
        @attribute: String; the attribute to be searched
        @output: The value of the attribute, or False when the status
                 cannot be fetched or the attribute name is unknown
        '''
        try:
            cp = AvCheckPoint(self.ssh)
            cpstatus = cp.cpstatus()
        except Exception as e:
            LOG.error("%s: Failed to get cpstatus:%s" %
                      (self.__classname, str(e)))
            return False
        # The attributes of the status object are looked up by name
        fields = vars(cpstatus)
        if attribute in fields:
            return fields[attribute]
        LOG.error("Invalid attribute name for check point :%s" %
                  (attribute))
        return False
def get_taskgroup_status(self):
    '''
    Get taskgroup status.
    @output: String; "Running" as soon as one process is alive, "Failed"
             when no process is alive and at least one exited non-zero,
             "Completed" otherwise; '' when the check itself raises.
    '''
    try:
        status = 'Completed'
        for task_dict in self.__tasks:
            p = task_dict['process_objs']
            if p.is_alive():
                return 'Running'
            elif p.exitcode != 0:
                # Keep scanning: a later process may still be running
                status = 'Failed'
    except Exception as e:
        LOG.error("%s:get_taskgroup_status failed with exception :%s" %
                  (self.__classname, str(e)))
        return ''
    return status
def new_task(self):
    '''
    Generate a new task object and register it in this group.
    @output: The new task object, or False when creating it raises.
    '''
    try:
        task_obj = Task()
        # Bookkeeping record of the task; the process, status and exit
        # code are unknown until the task is executed.
        task_dict = {
            'task_obj': task_obj,
            'process_objs': None,
            'status': None,
            'exit_code': None
        }
        self.__tasks.append(task_dict)
    except Exception as e:
        LOG.error("%s:new_task failed with exception" % (self.__classname))
        LOG.error(str(e))
        return False
    return task_obj
def get_node_list(self):
    '''
    Get node list of the server.

    avmaint ping --xmlperline=5 | egrep "<nodelist" | grep ONLINE
      <nodelist id="0.2" state="ONLINE" count="54">
      <nodelist id="0.1" state="ONLINE" count="49">
      <nodelist id="0.0" state="ONLINE" count="58">

    Node ids are appended to self.node_list; ids already present are
    skipped so that a repeated call does not duplicate entries.
    self.is_multi is set to True when more than one node is known.

    @raise ExecutionFailed: If the command fails.
    '''
    cmd = 'avmaint ping --xmlperline=5 | egrep "<nodelist" | grep ONLINE'
    result = self.ssh.run_command(cmd)
    if result[2]:
        LOG.error('Fail to get node list.')
        raise ExecutionFailed('Fail to get node list.')
    for line in result[0].split('\n'):
        m = re.search(r'id="(\d\.\d{1,2})"', line)
        if m and m.group(1) not in self.node_list:
            self.node_list.append(m.group(1))
    if len(self.node_list) > 1:
        self.is_multi = True
def create_checkpoint(self,
                      wait_to_completed=True,
                      timeout=36000,
                      interval=30):
    '''
    Create checkpoint, call command avmaint --ava checkpoint
    @wait_to_completed: Bool; wait for the check point to be completed
    @timeout: Integer; seconds to wait for completion
    @interval: Integer; seconds between two tries
    @output: The value returned by AvCheckPoint.create_checkpoint(), or
             False when the call raises
    '''
    try:
        cp = AvCheckPoint(self.ssh)
        cp_name = cp.create_checkpoint(wait_to_completed, timeout,
                                       interval)
    except Exception as e:
        LOG.error("%s: Failed to create check points:%s" %
                  (self.__classname, str(e)))
        return False
    # Callers use the result as the checkpoint name/tag
    return cp_name
def wait_task_group(self):
    '''
    Wait for all the tasks in this group to be completed.

    Each process is joined in list order; its record is then marked
    'completed' and its exit code is stored.
    @output: Boolean; True when every process was joined, False when
             waiting raises.
    '''
    try:
        for index, task_dict in enumerate(self.__tasks):
            p = task_dict['process_objs']
            p.join()
            # The record is updated in place, so the list sees the change
            task_dict['status'] = 'completed'
            task_dict['exit_code'] = p.exitcode
            LOG.info("task[%d] completed with exit_code=%d" %
                     (index, task_dict['exit_code']))
            LOG.log_background_messages(p.pid)
        return True
    except Exception as e:
        LOG.error("%s:wait_task_group failed with exception" %
                  (self.__classname))
        LOG.error(str(e))
        return False
def __add_task_op(self, task_obj, op, object_name, *args, **kwargs):
    '''
    Internal function, add an operation to one task.
    @task_obj: the object of the specified task which was generated by
               new_task()
    @op: String; add_class, add_method or ssh_reconn
         add_class: initialize the class and generate the instance
         add_method: call the method
         ssh_reconn: reconnect the ssh connection
    @object_name: String or object
         the class name when op is add_class
         the method name when op is add_method
         the ssh object when op is ssh_reconn
    @args: the arguments
    @kwargs: the kw arguments
    @output: Boolean; False when adding the operation raises.
    '''
    try:
        # NOTE(review): the value returned by add_ops() is not used in this
        # span; confirm what callers expect on success.
        result = task_obj.add_ops(op, object_name, *args, **kwargs)
    except Exception as e:
        # Report this function's own name (was copy-pasted from
        # wait_task_group).
        LOG.error("%s:__add_task_op failed with exception" %
                  (self.__classname))
        LOG.error(str(e))
        return False
def perf_check_account(self, account):
    '''
    Check for client account used for avtar, create account if not exist.
    @account: Specifies a hierarchical LOCATION on the Avamar server.

    If use avtar on Avamar server, need create an account.
    Otherwise, account will be created once the client registered,
    should be domain plus client name.

    An existing account has all of its backups deleted.
    @output: Boolean; True when the last step returned 0.
    '''
    self.account = account
    result = self.server_ssh.run_command('avmgr getu --account=%s' % account)
    # (result,) keeps the formatting valid even if result is a tuple
    LOG.info('Get account result: %s' % (result,))
    if result[2] != 0:
        # Creating client account if not exist. A real list is built so
        # that slicing also works where filter() returns an iterator.
        domains = [part for part in account.split('/') if part]
        # NOTE(review): each path component is created on its own rather
        # than as a cumulative path - confirm this is what avmgr expects
        # for nested domains.
        for domain in domains[:-1]:
            result = self.server_ssh.run_command(
                'avmgr newd --account=%s' % domain)
            # Return code 6: user or account already exists
            if result[2] == 6:
                continue
            elif result[2] != 0:
                LOG.error('Create account failed, %s' % (result,))
                return False
        result = self.server_ssh.run_command(
            'avmgr newm --account=%s' % account)
        LOG.info('Create account result: %s' % (result,))
        ret = result[2]
    else:
        # Remove the existing data for the client account
        ret = self.avtar.delete_all_backups(
            self.server, self.root_user, self.root_password, self.account)
    return not ret
def execute_group(self):
    '''
    Execute all the tasks in this group.

    Every task is handed to __execute_task_back() and its record is
    replaced by one holding the returned process object, status 'running'
    and the placeholder exit code 255.
    @output: Boolean; False when starting a task raises.
    '''
    try:
        for index, task_dict in enumerate(self.__tasks):
            LOG.info("Executing task[%d]" % (index))
            task_obj = task_dict['task_obj']
            process_obj = self.__execute_task_back(task_obj)
            self.__tasks[index] = {
                'task_obj': task_obj,
                'process_objs': process_obj,
                'status': 'running',
                'exit_code': 255
            }
        return True
    except Exception as e:
        LOG.error("%s:execute_group failed with exception" %
                  (self.__classname))
        LOG.error(str(e))
        return False
def scp_debug_script(self):
    '''
    Copy debug script to Avamar server.

    The node list is refreshed first. The script is put into
    self.remote_dir on the connected node and, when self.is_multi is set,
    copied with scn to every node of self.node_list.
    @output: Boolean; False as soon as one scn copy fails.
    '''
    self.get_node_list()
    # The connected (utility) node receives the script first
    self.ssh.directory_should_exist(self.remote_dir)
    self.ssh.put_file(
        self.chd_dump_script, destination=self.remote_dir, mode='0755')
    eval_cmd = 'eval `ssh-agent`;eval `ssh-add ~/.ssh/dpnid`;'
    # Single-node servers have nothing else to copy to
    targets = self.node_list if self.is_multi else []
    for node in targets:
        scn_cmd = 'scn %s %s:%s' % (self.remote_script, node,
                                    self.remote_dir)
        if self.ssh.run_command(eval_cmd + scn_cmd)[2]:
            LOG.error('Falied to copy debug script to node %s' % node)
            return False
    LOG.info('Copy debug script to all nodes.')
    return True
def execute_ops(self):
    '''
    Execute the operation list in the task, called by execute_task.

    Each operation is one of
      ssh_reconn: log in again on the given ssh object
      add_class:  instantiate the named class through init_call_class()
      otherwise:  call the named method on the current instance
    and is written back to the list together with its instance and result.
    @output: The result of the last operation; False as soon as one
             operation fails or raises.
    '''
    result = False
    try:
        for index, oplist in enumerate(self.__ops):
            # The last two fields are the instance/result of an earlier
            # run and are not needed here.
            (op, object_name, args, kwargs, _, _) = oplist
            result = False
            if op == "ssh_reconn":
                LOG.info("Executing task: ssh reconnecting")
                ssh = object_name
                ssh.login()
                cls_instance = ssh
                result = True
            elif op == "add_class":
                LOG.info("Executing task: init_call_class :%s" %
                         (object_name))
                cls_instance = self.init_call_class(object_name, *args,
                                                    **kwargs)
                if cls_instance:
                    result = True
            else:
                LOG.info("Executing task: call mothod :%s" % (object_name))
                cls_instance = self.__current_instance
                result = self.call_instance_method(cls_instance,
                                                   object_name, *args,
                                                   **kwargs)
                # An integer 0 is a success return code. The exact type is
                # checked on purpose so that a bool False stays a failure.
                if type(result) is int and result == 0:
                    result = True
            if not result:
                LOG.error("%s:Execute task %s %s failed " %
                          (self.__classname, op, object_name))
                return False
            self.__ops[index] = (op, object_name, args, kwargs,
                                 cls_instance, result)
    except Exception as e:
        LOG.error("%s:Execute execute_ops failed with exception" %
                  (self.__classname))
        LOG.error(str(e))
        return False
    return result
def is_taskgroup_success(self):
    '''
    Check whether all the tasks are completed successfully.
    @output: Boolean; True when every task has status 'completed' and
             exit code 0, False otherwise or when the check raises.
    '''
    try:
        for index, task_dict in enumerate(self.__tasks):
            if (task_dict['status'] != 'completed'
                    or task_dict['exit_code'] != 0):
                # %s instead of %d: the exit code may still be None for a
                # task that never ran, which %d cannot format.
                LOG.error("task[%d] status=%s exit_code=%s" %
                          (index, task_dict['status'],
                           task_dict['exit_code']))
                return False
        return True
    except Exception as e:
        LOG.error("%s:is_taskgroup_success failed with exception" %
                  (self.__classname))
        LOG.error(str(e))
        return False
def perf_replicate(self, dst_server, dst_user, dst_password):
    '''
    Run basic replicate to destination Avamar server and calculate speed.

    avrepl --operation=replicate --hfsaddr=src_addr --id=root
    --ap=Chang3M3Now. --dstaddr=dst_addr --dstid=repluser
    --dstap=Chang3M3Now. --dpnname=src_addr
    --[replscript]dstencrypt=tls --logfile=/tmp/replicate.log
    account 2>&1

    The output of avrepl is redirected to replicate.log in self.log_dir;
    the elapsed time is read from that file. The figures are stored in
    self.replicate_elapsed, self.replicate_speed and
    self.summary['Replicate'].

    @dst_server: Destination server.
    @dst_user: Destination server repl user.
    @dst_password: Destination server repl user password.
    @output: Boolean; False when avrepl fails or the elapsed time cannot
             be read from the log.
    '''
    log_file = os.path.join(self.log_dir, 'replicate.log')
    template = ('avrepl --operation=replicate --hfsaddr=%s --id=%s '
                '--ap=%s --dstaddr=%s --dstid=%s --dstap=%s --dpnname=%s '
                '--[replscript]dstencrypt=tls %s > %s 2>&1')
    cmd = template % (self.server, self.root_user, self.root_password,
                      dst_server, dst_user, dst_password, self.server,
                      self.account, log_file)
    result = self.server_ssh.run_command(cmd)
    if result[2] != 0:
        LOG.error('Replicate to %s failed, please check log file %s.' %
                  (dst_server, log_file))
        LOG.error('Return result: %s' % str(result))
        return False

    elapsed = self.analyze_log(self.server_ssh, log_file)
    if elapsed is None:
        LOG.error('Can not get elapsed time from log file.')
        return False

    speed = self.calculate_speed(self.total_size, elapsed)
    self.replicate_elapsed = elapsed
    self.replicate_speed = speed
    self.summary['Replicate'] = {
        'Type': 'Replicate',
        'File Size': self.size_str,
        'File Count': self.count,
        'Total Size': self.total_size_str,
        'Test Time (mins)': elapsed,
        'Rate (GB/Hr)': speed
    }
    LOG.info('Replicated %s %s in %s minutes: %s GB/hour.' %
             (self.total_size, self.size_unit, elapsed, speed))
    return True
def call_instance_method(self, instance, method, *args, **kwargs):
    '''
    Call the method specified by instance and method name with the
    specified arguments.
    @instance: The instance object
    @method: String; the method name in the instance object
    @args: the arguments passed to the method
    @kwargs: the kw arguments passed to the method
    @output: The output returned by the method; False when the method
             does not exist or the call raises
    '''
    clsname = self.__current_classname
    result = False
    try:
        func = getattr(instance, method, None)
        if func:
            result = func(*args, **kwargs)
            # Only report a start when something was actually called
            LOG.info("Start method successfuly:%s, result=%s" %
                     (method, result))
        else:
            LOG.error("No method:%s for class:%s" % (method, clsname))
    except Exception as e:
        LOG.error("%s:Execute %s.%s %s %s failed with exception" %
                  (self.__classname, clsname, method, args, kwargs))
        LOG.error(str(e))
        return False
    return result
def overlap_command_running(cmdlist, cmd1, ssh):
    '''
    This function is used to run several commands at the same time on
    server.
    @cmdlist: command list which are running in the background by using
              start_command
    @cmd1: command executed by using execute_command
    @ssh: server connection
    @output: Boolean; True when every command returned 0 and neither its
             stdout nor its stderr contains 'ERROR'.
    '''
    stdout_result, stderr_result, stdrc_result = ssh.run_overlap_command(
        cmdlist, cmd1)
    for i, return_code in enumerate(stdrc_result):
        if return_code != 0:
            LOG.error('Command return code is %s' % return_code)
            return False
        if 'ERROR' in stdout_result[i]:
            # Log the stream that actually holds the error text
            LOG.error(stdout_result[i])
            return False
        if 'ERROR' in stderr_result[i]:
            LOG.error(stderr_result[i])
            return False
    return True
def run_acp_test(self, email, project, configuration, start_seed,
                 end_seed):
    '''
    Run an ACP test.

    Steps: take a checkpoint, stop gsan, remove the cur data and all the
    other checkpoints, roll back to the new checkpoint, then drive the
    interactive acp.pl script and poll until its process is gone.

    @email: Engineer email address.
    @project: Project Name.
    @configuration: The test configuration that was run.
    @start_seed: The *first* backup seed used to fill each node.
    @end_seed: The *final* backup seed used to fill each node.
    @output: Boolean; True when the final script output contains
             'ACP Test complete', False on that or any earlier failure.
    '''
    # 1. Take a checkpoint and note the cp tag
    avcp = AvCP(self.server_ssh)
    cp_name = avcp.create_checkpoint()
    all_cp_name = avcp.get_all_cp_name()
    if cp_name is False:
        LOG.error('Take checkpoint failed')
        return False
    LOG.info('Take a checkpoint %s' % cp_name)

    # 2. Stop Gsan
    avdpn = AvDpn(self.server_ssh)
    if avdpn.shutdown_dpn() is False:
        LOG.error('Shutdown gsan failed')
        return False
    LOG.info('Gsan stopped')

    # 3. Delete the other checkpoints and the cur data
    if self.mapall_root("rm -rf /data0?/cur") != 0:
        LOG.error('Remove cur data failed')
        return False
    LOG.info('Removed cur data')
    other_tags = [tag for tag in all_cp_name if tag != cp_name]
    for tag in other_tags:
        LOG.info('Remove checkpoint %s' % tag)
        if self.mapall_root("rm -rf /data0?/%s" % tag) != 0:
            LOG.error('Remove other checkpoints failed')
            return False
    LOG.info('Removed all other checkpoints')

    # 4. Roll back to checkpoint "rollback.dpn --nocheck --cptag=cp..."
    if not avdpn.rollback_dpn(cp_name, options='--nocheck'):
        LOG.error('Rollback to checkpoint %s failed' % cp_name)
        return False
    LOG.info('Rollback to checkpoint %s' % cp_name)

    # 5. Run the acp script "acp.pl"
    LOG.info('Start to run ACP test')
    self.wipe_workdir('/home/admin/work')
    script = os.path.join(self.remote_dir, 'acp-1.15', 'acp.pl')
    cmd = 'eval `ssh-agent`; eval `ssh-add /home/admin/.ssh/dpnid`; perl %s' % script
    # The prompts of acp.pl in the order they are awaited, each paired
    # with the answer written back.
    dialogue = (
        ('Would you like to change the repository directory', 'y'),
        ('Enter the repository directory to use',
         '/home/admin/results-archive'),
        ('Enter Engineer email address', email),
        ('Enter the Project Name', project),
        ('Enter one line describing the test configuration that was run',
         configuration),
        ('Please enter the \\*first\\* backup seed used to fill each node',
         str(start_seed)),
        ('Please enter the \\*final\\* backup seed used to fill each node',
         str(end_seed)),
        ('Would you like to begin the test', 'y'),
    )
    self.server_ssh.login()
    self.server_ssh.write(cmd)
    for prompt, answer in dialogue:
        self.server_ssh.read_until_regexp(prompt)
        self.server_ssh.write(answer)

    # A second connection watches the process while the first one stays
    # attached to the interactive script.
    self.monitor_ssh = SSHConnection(self.server, self.admin_user,
                                     self.admin_password, 22, 120, '')
    self.monitor_ssh.login()
    timeout = 608400
    elapsed = 0
    while elapsed < timeout:
        result = self.monitor_ssh.run_command(
            'ps -ef | grep "acp\\.pl" | grep -v grep')
        # Non-zero return code: no acp.pl process was found any more
        if result[2] != 0:
            LOG.info('ACP test finished')
            break
        LOG.info('ACP test is running')
        LOG.info(self.server_ssh.read())
        time.sleep(300)
        elapsed += 300

    result = self.server_ssh.read()
    LOG.info('ACP test result: %s' % result)
    return 'ACP Test complete' in result