def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
    self.dbstate = DbStateClass('run_validation')
    tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
    self.dbstate.check_catalog()
    gp_seg_conf = PSQL.run_sql_command("select * from gp_segment_configuration order by dbid")
    tinctest.logger.info(gp_seg_conf)
    return True
def __init__(self, methodName):
    self.pgport = os.environ.get('PGPORT')
    self.util = Filerepe2e_Util()
    self.gpconfig = GpConfig()
    self.config = GPDBConfig()
    self.gpr = GpRecover(self.config)
    self.dbstate = DbStateClass('run_validation', self.config)
    self.gpstart = GpStart()
    self.gpstop = GpStop()
    super(FilerepTestCase, self).__init__(methodName)
def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
    self.dbstate = DbStateClass('run_validation')
    tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
    self.dbstate.check_catalog()
    return True
def __init__(self, methodName):
    self.fileutil = Filerepe2e_Util()
    self.config = GPDBConfig()
    self.gprecover = GpRecover(self.config)
    self.gpstart = GpStart()
    self.gpstop = GpStop()
    self.gpfile = Gpfilespace(self.config)
    self.dbstate = DbStateClass('run_validation', self.config)
    self.port = os.getenv('PGPORT')
    self.base = GPDBStorageBaseTestCase()
    super(SuspendCheckpointCrashRecovery, self).__init__(methodName)
def _validation(self):
    '''
    @summary: gpcheckcat and gpcheckmirrorintegrity
    '''
    ###psql.run_shell_command("CHECKPOINT; CHECKPOINT; CHECKPOINT; CHECKPOINT; CHECKPOINT;")
    ###sleep(30)  # sleep for some time for the segments to be in sync before validation
    self.dbstate = DbStateClass('run_validation')
    tinctest.logger.info("[STLRTest] Running _validation")
    outfile = local_path("subt_checkcat.out")
    self.dbstate.check_catalog(outputFile=outfile)
    self.dbstate.check_mirrorintegrity()
def __init__(self, methodName):
    self.pgport = os.environ.get('PGPORT')
    self.util = Filerepe2e_Util()
    self.gpconfig = GpConfig()
    self.config = GPDBConfig()
    self.gpr = GpRecover(self.config)
    self.dbstate = DbStateClass('run_validation', self.config)
    self.gpstart = GpStart()
    self.gpstop = GpStop()
    super(FilerepTestCase, self).__init__(methodName)
def __init__(self, methodName):
    self.filereputil = Filerepe2e_Util()
    self.config = GPDBConfig()
    self.gprecover = GpRecover(self.config)
    self.gpstop = GpStop()
    self.gpstart = GpStart()
    self.gpverify = GpdbVerify(config=self.config)
    self.dbstate = DbStateClass('run_validation', self.config)
    self.port = os.getenv('PGPORT')
    super(PgtwoPhaseClass, self).__init__(methodName)
def validate_test_CatalogCheck(self, action, storage):
    file_name = action + '_' + storage
    out_file = self.base_dir + "/sql/" + file_name + '.out'
    ans_file = self.base_dir + "/expected/" + file_name + '.ans'
    tinctest.logger.info('out-file == %s \n' % out_file)
    tinctest.logger.info('ans-file == %s \n' % ans_file)
    # Validate Ans file
    self.validate_sql(ans_file, out_file)
    if storage == 'multisegfiles':
        ''' check if multi_segfile_tab file has multiple segfiles per column '''
        tablename = 'multi_segfile_tab'
        relid = self.get_relid(file_name=tablename)
        utilitymodeinfo = self.get_utilitymode_conn_info(relid=relid)
        u_port = utilitymodeinfo[0]
        u_host = utilitymodeinfo[1]
        assert(1 < int(self.get_segment_cnt(relid=relid, host=u_host, port=u_port)))
    # Check Correctness of the catalog
    self.dbstate = DbStateClass('run_validation')
    outfile = local_path("gpcheckcat_" + datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S') + ".out")
    self.dbstate.check_catalog(outputFile=outfile)
def __init__(self, config=None):
    if config is not None:
        self.config = config
    else:
        self.config = GPDBConfig()
    self.filereputil = Filerepe2e_Util()
    self.gprecover = GpRecover(self.config)
    self.gpstop = GpStop()
    self.gpstart = GpStart()
    self.gpverify = GpdbVerify(config=self.config)
    self.dbstate = DbStateClass('run_validation', self.config)
    self.port = os.getenv('PGPORT')
def run_validation(self):
    tinctest.logger.info('Verify the integrity between primary and mirror ...')
    self.dbstate = DbStateClass('run_validation')
    self.dbstate.check_mirrorintegrity()
class SubTransactionLimitRemovalTestCase(MPPTestCase):

    def __init__(self, methodName):
        super(SubTransactionLimitRemovalTestCase, self).__init__(methodName)

    def check_system(self):
        '''
        @summary: Check whether the system is up and in sync. Exit out if not
        '''
        tinctest.logger.info("[STLRTest] Running check_system")
        tinctest.logger.info("[STLRTest] Check whether the system is up and in sync")
        cmd = "select count(*) from gp_segment_configuration where content<> -1 ;"
        num_cl = PSQL.run_sql_command(cmd)
        count_all = num_cl.split('\n')[3].strip()
        cmd = "select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';"
        num_cl = PSQL.run_sql_command(cmd)
        count_up_and_sync = num_cl.split('\n')[3].strip()
        tinctest.logger.info("[STLRTest] printing gp segment configuration")
        gp_seg_conf = PSQL.run_sql_command("select * from gp_segment_configuration order by dbid")
        tinctest.logger.info(gp_seg_conf)
        if count_all != count_up_and_sync:
            raise Exception("[STLRTest] System not in sync and up. Exiting test")
        else:
            tinctest.logger.info("[STLRTest] Starting New Test: System is up and in sync...")

    def run_sqls(self, test):
        '''
        @summary: Run the sql
        @param test: the sql file list
        '''
        tinctest.logger.info("[STLRTest] Running run_sqls")
        tinctest.logger.info("[STLRTest]Starting new thread to run sql %s" % (test))
        PSQL.run_sql_file(local_path(test))

    def suspend_faults(self, fault_name):
        '''
        @summary: Suspend the specified fault: reset it before issuing suspend
        @param fault_name: Name of the fault to suspend
        '''
        tinctest.logger.info("[STLRTest] Running suspend_faults")
        self.util = Filerepe2e_Util()
        (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='reset', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting the %s fault" % (fault_name))
        (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='suspend', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done suspending the %s fault" % (fault_name))

    def check_fault_status(self, fault_name=None, status=None, max_cycle=10):
        '''
        Check whether a fault is triggered. Poll till the fault is triggered
        @param fault_name: Fault name
        @param status: Status to be checked - triggered/completed
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running check_fault_status %s", status)
        if (not fault_name) or (not status):
            raise Exception("[STLRTest]Need a value for fault_name and status to continue")
        poll = 0
        while poll < max_cycle:
            (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='status', r='primary', H='ALL')
            poll += 1
            for line in out1.splitlines():
                if line.find(fault_name) > 0 and line.find(status) > 0:
                    tinctest.logger.info("[STLRTest]Fault %s is %s " % (fault_name, status))
                    poll = 0
                    tinctest.logger.info("[STLRTest] Running check_fault_status %s TRUE", status)
                    return True
            # sleep a while before starting to poll again
            sleep(10)
        tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE", status)
        return False

    def filerep_fault(self, trans_state):
        '''
        @summary: Inject the filerep fault supplied
        @param trans_state: type of transition
        '''
        tinctest.logger.info("[STLRTest] Running filerep_fault")
        self.util = Filerepe2e_Util()
        if trans_state == 'failover_to_primary':
            tinctest.logger.info("[STLRTest] primary failover")
            (ok1, out1) = self.util.inject_fault(f='filerep_consumer', m='async', y='fault', r='mirror', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done primary failover fault")
        elif trans_state == 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for postmaster panic")
            (ok1, out1) = self.util.inject_fault(f='postmaster', m='async', y='panic', r='primary', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done postmaster panic fault")
        elif trans_state == 'postmaster_reset':
            tinctest.logger.info("[STLRTest] fault for filerep_sender panic")
            (ok1, out1) = self.util.inject_fault(f='filerep_sender', m='async', y='panic', r='primary', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done filerep_sender panic fault")
        tinctest.logger.info("[STLRTest] Done Injecting Fault")

    def resume_faults(self, fault_name, trans_state):
        '''
        @summary: Resume the fault and check status
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running resume_faults")
        if not trans_state == 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for %s resume" % fault_name)
            (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='resume', r='primary', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault resume failed")
            tinctest.logger.info("[STLRTest]Done fault for %s resume" % fault_name)
        if trans_state == 'postmaster_reset':
            (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='resume', r='mirror', H='ALL')
            if not ok1:
                tinctest.logger.info("[STLRTest]Failed fault for %s resume on mirror" % fault_name)
        if trans_state == 'failover_to_primary':
            self.check_fault_status(fault_name, 'completed')

    def checkPSQLRun(self, test):
        ''' Check if the psql run started in parallel is over before running the _post.sql '''
        tinctest.logger.info("[STLRTest] Running checkPSQLRun")
        cmd_str = 'ps -ef|grep sub_transaction_limit_removal|grep psql'
        while (1):
            is_running = 0
            (rc, out) = shell.run(cmd_str)
            for line in out:
                if '%s' % test in line:
                    is_running = 1
            if is_running == 0:
                return True
            else:
                sleep(10)
        return False

    def resume_filerep_resync(self):
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running resume_filerep_resync")
        tinctest.logger.info("[STLRTest] fault for failover_to_mirror resume")
        (ok1, out1) = self.util.inject_fault(f='filerep_resync', m='async', y='resume', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done fault for failover_to_mirror resume")
        sleep(10)

    def stop_start_validate(self, expect_down_segments=False):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine
        """
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running stop_start_validate")
        tinctest.logger.info("[STLRTest]Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i')
        if not expect_down_segments:
            if not ok:
                raise Exception('[STLRTest]Problem while shutting down the cluster')
            tinctest.logger.info("[STLRTest]Successfully shutdown the cluster.")
        tinctest.logger.info("[STLRTest]Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        tinctest.logger.info("[STLRTest]Successfully restarted the cluster.")
        if not self.anydownsegments():
            raise Exception("[STLRTest]segments were marked down")
        else:
            return (True, "All segments are up")

    def run_gprecoverseg(self, recover_option):
        '''
        @summary: Call gprecoverseg full or incremental to bring back the cluster to sync
        '''
        self.gpr = GpRecover()
        tinctest.logger.info("[STLRTest] Running run_gprecoverseg")
        if recover_option == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()
        self.gpr.wait_till_insync_transition()

    def run_restart_database(self):
        '''
        @summary: Restart the database
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running run_restart_database")
        ok = self.gpstop.run_gpstop_cmd(immediate='i')
        tinctest.logger.info(ok)
        ok = self.gpstart.run_gpstart_cmd()
        tinctest.logger.info(ok)

    def reset_faults(self, fault_name, current_cluster_state):
        '''
        @summary: Reset the faults at the end of test
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running reset_faults")
        tinctest.logger.info("[STLRTest] Resetting fault before ending test")
        (ok1, out1) = self.util.inject_fault(f=fault_name, m='async', y='reset', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting %s fault" % (fault_name))
        if current_cluster_state == 'resync':
            (ok1, out1) = self.util.inject_fault(f='filerep_resync', m='async', y='reset', r='primary', H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done resetting filerep_resync fault")
        (ok1, out1) = self.util.inject_fault(f='checkpoint', m='async', y='reset', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting checkpoint fault")

    def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
        self.dbstate.check_catalog()
        return True

    def _validation(self):
        '''
        @summary: gpcheckcat and gpcheckmirrorintegrity
        '''
        ###psql.run_shell_command("CHECKPOINT; CHECKPOINT; CHECKPOINT; CHECKPOINT; CHECKPOINT;")
        ###sleep(30)  # sleep for some time for the segments to be in sync before validation
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running _validation")
        outfile = local_path("subt_checkcat.out")
        self.dbstate.check_catalog(outputFile=outfile)
        self.dbstate.check_mirrorintegrity()

    def inject_and_resume_fault(self, fault_name, trans_state):
        self.check_fault_status(fault_name, 'triggered')
        self.filerep_fault(trans_state)
        if trans_state == 'failover_to_mirror':
            PSQL.run_sql_file(local_path('test_while_ct.sql'))
        self.resume_faults(fault_name, trans_state)

    def run_post_sqls(self, fault_name='', trans_state=''):
        PSQL.wait_for_database_up()
        if (trans_state == 'failover_to_primary' or trans_state == ''):
            post_sql = "failover_sql/subt_create_table_ao_post_commit"
        else:
            post_sql = "failover_sql/subt_create_table_ao_post_abort"
        sql_file = post_sql + ".sql"
        ans_file = post_sql + ".ans"
        out_file = post_sql + ".out"
        PSQL.run_sql_file(sql_file=local_path(sql_file), out_file=local_path(out_file))
        diff_res = Gpdiff.are_files_equal(local_path(out_file), local_path(ans_file))
        if not diff_res:
            self.fail("[STLRTest]Gpdiff failed for : %s %s" % (fault_name, trans_state))

    def reset_all_faults(self):
        '''
        @summary: Reset all faults on primary and mirror
        '''
        tinctest.logger.info("[STLRTest] Running reset_all_faults")
        self.util = Filerepe2e_Util()
        (ok1, out1) = self.util.inject_fault(f='all', m='async', y='reset', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting all faults on primary")
        (ok1, out1) = self.util.inject_fault(f='all', m='async', y='reset', r='mirror', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting all faults on mirror")

    def kill_zombies(self):
        '''
        @summary: There are stray zombie processes running after each test. This method clears them
        '''
        tinctest.logger.info("[STLRTest] Running kill_zombies")
        cmd_str = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1"
        cmd = Command("shell_command", cmd_str)
        tinctest.logger.info('Executing command: %s : %s' % ("shell command", cmd_str))
        cmd.run()
        result = cmd.get_results()
        out = result.stdout
        lines = out.split('\n')
        for line in lines:
            pids = line.split('#')
            if pids[0] == '1':
                kill_str = "kill -9 %s" % (pids[1])
                cmd2 = Command("kill_command", kill_str)
                cmd2.run()

    def skip_checkpoint(self):
        '''
        @summary: Routine to inject fault that skips checkpointing
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running skip_checkpoint")
        (ok1, out1) = self.util.inject_fault(f='checkpoint', m='async', y='reset', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting the checkpoint fault")
        (ok1, out1) = self.util.inject_fault(f='checkpoint', m='async', y='skip', r='primary', H='ALL')
        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done skipping the checkpoint fault")

    def method_setup(self):
        tinctest.logger.info("Performing setup tasks")
        gpfs = Gpfilespace()
        gpfs.create_filespace('subt_filespace_a')

    def cleandb(self):
        db = Database()
        db.setupDatabase('gptest')
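# A usage sketch, not taken from the source: a hypothetical test method showing
# how the SubTransactionLimitRemovalTestCase helpers above are typically chained
# for one fault-injection cycle. 'some_fault' is a placeholder fault name and the
# sql path is illustrative only; only methods defined in the class above are used.

def test_subtransaction_fault_cycle_sketch(self):
    fault = 'some_fault'                       # placeholder, not a real fault name
    self.check_system()                        # require an up, in-sync cluster
    self.skip_checkpoint()                     # keep checkpoints from masking the fault
    self.suspend_faults(fault)                 # reset, then suspend the fault on all primaries
    self.run_sqls('failover_sql/illustrative_workload.sql')
    self.inject_and_resume_fault(fault, 'failover_to_primary')
    self.run_post_sqls(fault, 'failover_to_primary')
    self.reset_faults(fault, current_cluster_state='sync')
    self._validation()                         # gpcheckcat + mirror integrity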
def test_gpcheckcat(self):
    dbstate = DbStateClass('run_validation')
    dbstate.check_catalog(alldb=False, dbname=Verification.dbname)
def test_verify_catalog(self):
    dbstate = DbStateClass('run_validation')
    dbstate.check_catalog(alldb=False, dbname=catalog_consistency.dbname)
def test_mpp23395(self):
    """
    @description Test MPP-20964, uncleaned lock table by pg_terminate_backend
    @product_version gpdb: [4.3.3.1-],[4.2.8.5-4.2.99.99]
    """
    self.util = Filerepe2e_Util()

    (ok, out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1)
    if not ok:
        raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared")

    # setup
    PSQL.run_sql_command("""
    DROP TABLE IF EXISTS mpp23395;
    """)

    # Scenario 1: FAULT during Create Table on master
    sql = '''
    CREATE TABLE mpp23395(a int);
    '''
    self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1)

    # Scenario 2: FAULT during Drop Table on master, COMMIT case
    sql = '''
    DROP TABLE mpp23395;
    '''
    self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1)

    (ok, out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1)
    if not ok:
        raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared")

    # Scenario 3: FAULT during Create Table on segment, COMMIT case
    sql = '''
    SET dtx_phase2_retry_count = 1;
    SET debug_dtm_action_target = "protocol";
    SET debug_dtm_action_protocol = "commit_prepared";
    SET debug_dtm_action_segment = 0;
    SET debug_dtm_action = "fail_begin_command";
    CREATE TABLE mpp23395(a int);
    '''
    self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2)

    # Scenario 4: FAULT during Drop Table on segment, COMMIT case
    sql = '''
    SET dtx_phase2_retry_count = 1;
    SET debug_dtm_action_target = "protocol";
    SET debug_dtm_action_protocol = "commit_prepared";
    SET debug_dtm_action_segment = 0;
    SET debug_dtm_action = "fail_begin_command";
    DROP TABLE mpp23395;
    '''
    self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2)

    # Scenario 5: FAULT during Create Table on master, ABORT case
    (ok, out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1)
    if not ok:
        raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared")

    sql = '''
    CREATE TABLE mpp23395(a int);
    '''
    self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1)

    (ok, out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1)
    if not ok:
        raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared")

    PSQL.run_sql_command("""
    CREATE TABLE mpp23395(a int);
    """)

    # Scenario 6: FAULT during Drop Table on master, ABORT case
    (ok, out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1)
    if not ok:
        raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared")

    sql = '''
    DROP TABLE mpp23395;
    '''
    self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1)

    (ok, out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1)
    if not ok:
        raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared")

    PSQL.run_sql_command("""
    DROP TABLE mpp23395;
    """)

    # Scenario 7: FAULT during Create Table on segment, COMMIT case, succeeds on second retry
    sql = '''
    DROP TABLE IF EXISTS mpp23395;
    SET debug_dtm_action_target = "protocol";
    SET debug_dtm_action_protocol = "commit_prepared";
    SET debug_dtm_action_segment = 0;
    SET debug_dtm_action = "fail_begin_command";
    CREATE TABLE mpp23395(a int);
    '''
    self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2, False)

    # QE panics after writing prepare xlog record. This should cause
    # master to broadcast abort but QEs handle the abort in
    # DTX_CONTEXT_LOCAL_ONLY context.
    sql = '''
    DROP TABLE IF EXISTS mpp23395;
    CREATE TABLE mpp23395(a int);
    INSERT INTO mpp23395 VALUES(1), (2), (3);
    BEGIN;
    SET debug_abort_after_segment_prepared = true;
    DELETE FROM mpp23395;
    COMMIT;
    '''
    # No prepared transactions should remain lingering
    PSQL.run_sql_command(sql)

    self.check_no_dangling_prepared_transaction()

    dbstate = DbStateClass('run_validation')
    dbstate.check_catalog()
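# run_sequence() is called throughout test_mpp23395 but is not part of this
# excerpt. A hedged sketch of what such a driver plausibly does, built only from
# the Filerepe2e_Util and PSQL calls already used above; the real method in the
# suite may differ (for example in how it waits for the fault or for recovery).

def run_sequence_sketch(self, sql, fault, fault_type, segid, should_reset=True):
    # Arm the fault on the chosen segment, run the SQL that trips it,
    # then reset the fault and wait for the database before validating.
    (ok, out) = self.util.inject_fault(f=fault, y=fault_type, seg_id=segid)
    if not ok:
        raise Exception("Failed to inject fault %s" % fault)
    PSQL.run_sql_command(sql)
    if should_reset:
        (ok, out) = self.util.inject_fault(f=fault, y='reset', seg_id=segid)
        if not ok:
            raise Exception("Failed to reset fault %s" % fault)
    PSQL.wait_for_database_up()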
class FilerepTestCase(MPPTestCase):

    def __init__(self, methodName):
        self.pgport = os.environ.get('PGPORT')
        self.util = Filerepe2e_Util()
        self.gpconfig = GpConfig()
        self.config = GPDBConfig()
        self.gpr = GpRecover(self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        super(FilerepTestCase, self).__init__(methodName)

    def sleep(self, seconds=60):
        time.sleep(seconds)

    def create_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('create a file', 'touch %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def remove_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('remove a file', 'rm %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def get_timestamp_of_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('check timestamp',
                      """ python -c "import os; print os.stat('%s').st_mtime" """ % file_path,
                      ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)
        res = cmd.get_results().stdout.strip()
        return res

    def verify_file_exists(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('check if file exists', 'test -f %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def handle_ext_cases(self, file):
        """
        @file: wet sql file to replace with specific machine env.
        """
        host = str(socket.gethostbyname(socket.gethostname()))  # Must be an IP
        querystring = "gpfdist://" + host + ":8088"
        if os.path.isfile(file):
            for line in fileinput.FileInput(file, inplace=1):
                line = re.sub('gpfdist.+8088', querystring, line)
                print str(re.sub('\n', '', line))

    def handle_hybrid_part_cases(self, file):
        """
        @file: hybrid sql file to replace with specific machine env
        """
        querystring = "FROM '" + local_path('hybrid_part.data') + "'"
        if os.path.isfile(file):
            for line in fileinput.FileInput(file, inplace=1):
                line = re.sub('FROM\s\'.+hybrid_part.data\'', querystring, line)
                print str(re.sub('\n', '', line))

    def preprocess(self):
        """
        Replace the hard-coded information from sql files with correct hostname and ip address, etc
        """
        list_workload_dir = ['set_sync1', 'sync1', 'set_ck_sync1', 'ck_sync1',
                             'set_ct', 'ct', 'set_resync', 'resync', 'set_sync2', 'sync2']
        for dir in list_workload_dir:
            sql_path = os.path.join(local_path(dir), 'sql')
            ans_path = os.path.join(local_path(dir), 'expected')
            for file in os.listdir(sql_path):
                if (file.find('wet_ret') >= 0):
                    self.handle_ext_cases(os.path.join(sql_path, file))
                if (file.find('hybrid_part') >= 0):
                    self.handle_hybrid_part_cases(os.path.join(sql_path, file))
            for file in os.listdir(ans_path):
                if (file.find('wet_ret') >= 0):
                    self.handle_ext_cases(os.path.join(ans_path, file))
                if (file.find('hybrid_part') >= 0):
                    self.handle_hybrid_part_cases(os.path.join(ans_path, file))

    def clean_data(self):
        """
        Clean the data by removing the external table, otherwise, more data will be
        appended to the same external table from running multiple sql files.
        """
        test = local_path("")
        test = str(test) + "data/*.*"
        cmd = 'rm -rfv ' + test
        run_shell_command(cmd)

    def anydownsegments(self):
        """
        checks if any segments are down
        """
        tinctest.logger.info("Checking if any segments are down")
        num_segments_down = self.count_of_nodes_down()
        if int(num_segments_down) == 0:
            return True
        else:
            return False

    def stop_start_validate(self, stopValidate=True):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine
        """
        tinctest.logger.info("Performing stop start validate")
        tinctest.logger.info("Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i', validate=stopValidate)
        if not ok and stopValidate:
            raise Exception('Problem while shutting down the cluster')
        tinctest.logger.info("Successfully shutdown the cluster.")
        tinctest.logger.info("Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failed to bring the cluster back up')
        tinctest.logger.info("Successfully restarted the cluster.")
        if not self.anydownsegments():
            raise Exception("segments were marked down")
        else:
            return (True, "All segments are up")

    def method_reset_fault_injection(self):
        """
        Resets fault injection
        Return: (True, [result]) if OK, or (False, [result]) otherwise
        """
        tinctest.logger.info("Resetting fault injection")
        (ok1, out1) = self.util.inject_fault(f='filerep_resync', m='async', y='reset', r='primary', H='ALL')
        if not ok1:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to reset resync")
        return (True, str(out1))

    def method_resume_filerep_resync(self):
        """
        Resumes the process of resync
        """
        tinctest.logger.info("Resuming Resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync', m='async', y='resume', r='primary', H='ALL')
        if not ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done resuming resync")
        return (ok, out)

    def run_method_suspendresync(self):
        """
        Stops the cluster from going to resync
        """
        tinctest.logger.info("Suspending resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync', m='async', y='suspend', r='primary', H='ALL')
        tinctest.logger.info('output from suspend resync %s' % out)
        if not ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to suspend resync")
        return (ok, out)

    def count_of_masters(self):
        """
        Gives count of number of nodes in the cluster that are master
        Return: count of number of nodes in the cluster that are master
        """
        tinctest.logger.info("Count the number of masters")
        cmd = "select count(*) from gp_segment_configuration where content = -1"
        out = PSQL.run_sql_command(cmd)
        num_master = out.split('\n')[3].strip()
        return num_master

    def count_of_nodes(self):
        """
        Gives count of number of nodes in the cluster
        Return: count of number of nodes in the cluster
        """
        tinctest.logger.info("Counting number of nodes")
        cmd = "select count(*) from gp_segment_configuration"
        num_cl = PSQL.run_sql_command(cmd)
        total_num_rows = num_cl.split('\n')[3].strip()
        return total_num_rows

    def count_of_nodes_in_ct(self):
        """
        Gives count of number of nodes in change tracking
        Return: count of number of nodes in change tracking
        """
        tinctest.logger.info("Counting number of nodes in ct")
        sqlcmd = "select count(*) from gp_segment_configuration where mode = 'c'"
        num_cl = PSQL.run_sql_command(sqlcmd)
        num_cl = num_cl.split('\n')[3].strip()
        return num_cl

    def count_of_nodes_down(self):
        """
        Gives count of number of nodes marked as down
        Return: count of number of nodes marked as down
        """
        tinctest.logger.info("Counting the number of nodes down")
        sqlcmd = "select count(*) from gp_segment_configuration where status = 'd'"
        num_down = PSQL.run_sql_command(sqlcmd)
        num_down = num_down.split('\n')[3].strip()
        return num_down

    def count_of_nodes_sync(self):
        """
        Gives count of number of nodes in sync
        Return: count of number of nodes in sync
        """
        tinctest.logger.info("Counting the number of nodes in sync")
        sqlcmd = "select count(*) from gp_segment_configuration where mode = 's'"
        num_sync = PSQL.run_sql_command(sqlcmd)
        num_sync = num_sync.split('\n')[3].strip()
        return num_sync

    def count_of_nodes_not_sync(self):
        """
        Gives count of number of nodes not in sync
        Return: count of number of nodes not in sync
        """
        tinctest.logger.info("Counting number of nodes not in sync")
        sqlcmd = "select count(*) from gp_segment_configuration where mode <> 's'"
        num_sync = PSQL.run_sql_command(sqlcmd)
        num_sync = num_sync.split('\n')[3].strip()
        return num_sync

    def inject_fault_on_first_primary(self):
        """
        @product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        tinctest.logger.info("\n Injecting faults on first primary")
        (ok, out) = self.util.inject_fault(f='filerep_immediate_shutdown_request', m='async',
                                           y='infinite_loop', r='primary', seg_id=2, sleeptime=300)
        if not ok:
            raise Exception("Fault filerep_immediate_shutdown_request injection failed")
        (ok, out) = self.util.inject_fault(f='fileRep_is_operation_completed', m='async',
                                           y='infinite_loop', r='primary', seg_id=2)
        if not ok:
            raise Exception("Fault fileRep_is_operation_completed injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def inject_fault_on_first_mirror(self):
        """
        @product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        sqlcmd = "select dbid from gp_segment_configuration where content=0 and role='m'"
        first_mirror_dbid = PSQL.run_sql_command(sqlcmd)
        first_mirror_dbid = first_mirror_dbid.split('\n')[3].strip()
        tinctest.logger.info("\n Injecting faults on first mirror")
        flag = self.util.check_fault_status(fault_name='fileRep_is_operation_completed',
                                            status='triggered', max_cycle=100)
        if not flag:
            raise Exception("Fault fileRep_is_operation_completed didn't trigger")
        (ok, out) = self.util.inject_fault(f='filerep_consumer', m='async', y='panic',
                                           r='mirror', seg_id=first_mirror_dbid)
        if not ok:
            raise Exception("Fault filerep_consumer injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def setupGpfdist(self, port, path):
        gpfdist = Gpfdist(port, self.hostIP())
        gpfdist.killGpfdist()
        gpfdist.startGpfdist(' -t 30 -m 1048576 -d ' + path)
        return True

    def cleanupGpfdist(self, port, path):
        gpfdist = Gpfdist(port, self.hostIP())
        gpfdist.killGpfdist()
        return True

    def hostIP(self):
        ok = run_shell_command('which gpfdist')
        if not ok:
            raise GPtestError("Error:'which gpfdist' command failed.")
        hostname = socket.gethostname()
        if hostname.find('mdw') > 0:
            host = 'mdw'
        else:
            host = str(socket.gethostbyname(socket.gethostname()))  # Must be an IP
        tinctest.logger.info('current host is %s' % host)
        return host

    def method_setup(self):
        tinctest.logger.info("Performing setup tasks")
        gpfs = Gpfilespace()
        gpfs.create_filespace('filerep_fs_a')
        gpfs.create_filespace('filerep_fs_b')
        gpfs.create_filespace('filerep_fs_c')
        gpfs.create_filespace('filerep_fs_z')
        gpfs.create_filespace('sync1_fs_1')
        # Set max_resource_queues to 100
        cmd = 'gpconfig -c max_resource_queues -v 100 '
        ok = run_shell_command(cmd)
        if not ok:
            raise Exception('Failure during setting the max_resource_queues value to 100 using gpconfig tool')
        # Restart the cluster
        self.gpstop.run_gpstop_cmd(immediate='i')
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failure during restarting the cluster')
        return True

    def get_ext_table_query_from_gpstate(self):
        outfile = local_path("gpstate_tmp")
        ok = run_shell_command("gpstate --printSampleExternalTableSql >" + outfile)
        querystring = ""
        flag = 'false'
        out = open(outfile, 'r').readlines()
        for line in out:
            line.strip()
            if (line.find('DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status') >= 0):
                flag = 'true'
            if flag == 'true':
                querystring = querystring + line
        return querystring

    ############ RUN QUERY
    def check_gpstate(self, type, phase):
        """
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """
        if phase == 'sync1':
            state_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Synchronized' and status_in_config='Up' and instance_status='Up'")
            sync1_num = self.query_select_count("select count(*) from gp_segment_configuration where content <> -1")
            if int(sync1_num) <> int(state_num):
                raise Exception("gpstate in Sync state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))
        elif phase == 'ct':
            p_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Change Tracking' and role = 'Primary' and status_in_config='Up' and instance_status='Up'")
            m_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Out of Sync' and role = 'Mirror' and status_in_config='Down' and instance_status='Down in configuration' ")
            if int(p_num) <> int(m_num):
                raise Exception("gpstate in CT state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))
        elif phase == 'resync_incr':
            if type == 'primary':
                query = "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
                resync_incr_num = self.query_select_count(query)
            else:
                query = "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
                resync_incr_num = self.query_select_count(query)
            query_num_rows = "select count(*) from gp_segment_configuration where content <> -1"
            num_rows = self.query_select_count(query_num_rows)
            if int(resync_incr_num) <> int(num_rows):
                tinctest.logger.info("resync_incr_num query run %s" % query)
                tinctest.logger.info("num_rows query run %s" % query_num_rows)
                raise Exception("gpstate in Resync Incremental state failed. resync_incr_num %s <> num_rows %s" % (resync_incr_num, num_rows))
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))
        elif phase == 'resync_full':
            num_rows = self.query_select_count("select count(*) from gp_segment_configuration where content <> -1")
            if type == 'primary':
                resync_full_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and status_in_config='Up' and instance_status='Up' and resync_mode= 'Full'")
            else:
                resync_full_num = self.query_select_count("select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and status_in_config='Up' and instance_status='Up' and resync_mode= 'Full'")
            if int(resync_full_num) <> int(num_rows):
                raise Exception("gpstate in Resync Full state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))
        return True

    def trigger_transition(self):
        PSQL.run_sql_file(local_path('mirrors.sql'))

    def run_gpstate(self, type, phase):
        """
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """
        tinctest.logger.info("running gpstate")
        querystring = self.get_ext_table_query_from_gpstate()
        file1 = local_path('create_table_gpstate.sql')
        f1 = open(file1, 'w')
        f1.write(querystring)
        f1.write('\n')
        f1.close()
        PSQL.run_sql_file(local_path('create_table_gpstate.sql'))
        gpstate_outfile = local_path('gpstate_out')
        cmd = 'gpstate -s -a > %s 2>&1' % (gpstate_outfile)
        ok = run_shell_command(cmd)
        self.check_gpstate(type, phase)
        return ok

    def check_mirror_seg(self, master=False):
        tinctest.logger.info("running check mirror")
        self.dbstate.check_mirrorintegrity()

    def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
        tinctest.logger.info("running gpcheckcat")
        self.dbstate.check_catalog(outputFile=outputFile)

    def query_select_count(self, sqlcmd):
        num = PSQL.run_sql_command(sqlcmd)
        num = num.split('\n')[3].strip()
        return num

    def method_run_failover(self, type):
        """
        Inject fault to failover nodes
        @type: primary [induces fault in mirror]
               mirror [creates panic in primary]
        Return: (True, [result of fault injection]) if OK, or (False, [result of fault injection]) otherwise
        """
        if type == 'primary':
            tinctest.logger.info("\n primary failover")
            (ok, out) = self.util.inject_fault(f='filerep_consumer', m='async', y='fault', r='mirror', H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")
        elif type == 'mirror':
            tinctest.logger.info("\n Mirror failover")
            (ok, out) = self.util.inject_fault(f='postmaster', m='async', y='panic', r='primary', H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")
        return True

    def wait_till_change_tracking_transition(self):
        self.util.wait_till_change_tracking_transition()

    def wait_till_insync_transition(self):
        self.gpr.wait_till_insync_transition()

    def run_gprecoverseg(self, recover_mode):
        if recover_mode == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

    def run_gpconfig(self, parameter, master_value, segment_value):
        if (parameter is not None):
            self.gpconfig.setParameter(parameter, master_value, segment_value)
            self.gpstop.run_gpstop_cmd(restart='r')

    def inject_fault(self, fault=None, mode=None, operation=None, prim_mirr=None, host='All',
                     table=None, database=None, seg_id=None, sleeptime=None, occurence=None):
        if (fault == None or mode == None or operation == None or prim_mirr == None):
            raise Exception('Incorrect parameters provided for inject fault')
        (ok, out) = self.util.inject_fault(f=fault, m=mode, y=operation, r=prim_mirr, H='ALL',
                                           table=table, database=database, sleeptime=sleeptime,
                                           o=occurence, seg_id=seg_id)
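# The count_of_* helpers above all repeat the same idiom:
#     PSQL.run_sql_command(cmd).split('\n')[3].strip()
# i.e. they take the fourth line of the captured psql output, which is where a
# single-row, single-column result lands with this wrapper's default output.
# A hedged sketch of how that repetition could be factored out as a standalone
# helper (query_select_count already exists inside the class; the import path
# is assumed to be the same mpp.lib.PSQL wrapper used by the snippets above):

from mpp.lib.PSQL import PSQL  # assumption: same wrapper as in the tests above

def select_single_value(sqlcmd):
    # Return the single value produced by a one-row, one-column query,
    # relying on the same fixed line offset the count_of_* helpers assume.
    out = PSQL.run_sql_command(sqlcmd)
    return out.split('\n')[3].strip()

# Example (illustrative):
#     num_down = select_single_value(
#         "select count(*) from gp_segment_configuration where status = 'd'")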
def test_mpp23395(self):
    """
    @description Test MPP-20964, uncleaned lock table by pg_terminate_backend
    @product_version gpdb: [4.3.3.1-],[4.2.8.5-4.2.99.99]
    """
    self.util = Filerepe2e_Util()

    (ok, out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1)
    if not ok:
        raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared")

    # setup
    PSQL.run_sql_command("""
    DROP TABLE IF EXISTS mpp23395;
    """)

    # Scenario 1: FAULT during Create Table on master
    sql = '''
    CREATE TABLE mpp23395(a int);
    '''
    self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1)

    # Scenario 2: FAULT during Drop Table on master, COMMIT case
    sql = '''
    DROP TABLE mpp23395;
    '''
    self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1)

    (ok, out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1)
    if not ok:
        raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared")

    # Scenario 3: FAULT during Create Table on segment, COMMIT case
    sql = '''
    SET dtx_phase2_retry_count = 1;
    SET debug_dtm_action_target = "protocol";
    SET debug_dtm_action_protocol = "commit_prepared";
    SET debug_dtm_action_segment = 0;
    SET debug_dtm_action = "fail_begin_command";
    CREATE TABLE mpp23395(a int);
    '''
    self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2)

    # Scenario 4: FAULT during Drop Table on segment, COMMIT case
    sql = '''
    SET dtx_phase2_retry_count = 1;
    SET debug_dtm_action_target = "protocol";
    SET debug_dtm_action_protocol = "commit_prepared";
    SET debug_dtm_action_segment = 0;
    SET debug_dtm_action = "fail_begin_command";
    DROP TABLE mpp23395;
    '''
    self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2)

    # Scenario 5: FAULT during Create Table on master, ABORT case
    (ok, out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1)
    if not ok:
        raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared")

    sql = '''
    CREATE TABLE mpp23395(a int);
    '''
    self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1)

    (ok, out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1)
    if not ok:
        raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared")

    PSQL.run_sql_command("""
    CREATE TABLE mpp23395(a int);
    """)

    # Scenario 6: FAULT during Drop Table on master, ABORT case
    (ok, out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1)
    if not ok:
        raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared")

    sql = '''
    DROP TABLE mpp23395;
    '''
    self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1)

    (ok, out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1)
    if not ok:
        raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared")

    PSQL.run_sql_command("""
    DROP TABLE mpp23395;
    """)

    # Scenario 7: FAULT during Create Table on segment, COMMIT case, succeeds on second retry
    sql = '''
    DROP TABLE IF EXISTS mpp23395;
    SET debug_dtm_action_target = "protocol";
    SET debug_dtm_action_protocol = "commit_prepared";
    SET debug_dtm_action_segment = 0;
    SET debug_dtm_action = "fail_begin_command";
    CREATE TABLE mpp23395(a int);
    '''
    self.run_sequence(sql, 'finish_prepared_after_record_commit_prepared', 'error', 2, False)

    # Scenario 8: QE panics after writing prepare xlog record. This should
    # cause master to broadcast abort but QEs handle the abort in
    # DTX_CONTEXT_LOCAL_ONLY context.
    sql = '''
    DROP TABLE IF EXISTS mpp23395;
    CREATE TABLE mpp23395(a int);
    INSERT INTO mpp23395 VALUES(1), (2), (3);
    SET debug_abort_after_segment_prepared = true;
    DELETE FROM mpp23395;
    '''
    # No prepared transactions should remain lingering
    PSQL.run_sql_command(sql)

    self.check_no_dangling_prepared_transaction()

    dbstate = DbStateClass('run_validation')
    dbstate.check_catalog()
class FtsTransitions(MPPTestCase):

    def __init__(self, methodName):
        self.pgport = os.environ.get('PGPORT')
        self.fileutil = Filerepe2e_Util()
        self.gpconfig = GPDBConfig()
        self.gprecover = GpRecover(self.gpconfig)
        self.gpstate = Gpstate()
        self.gpprimarymirror = Gpprimarymirror()
        self.base = GPDBStorageBaseTestCase(self.gpconfig)
        super(FtsTransitions, self).__init__(methodName)

    def kill_first_mirror(self):
        mirror_data_loc = self.get_default_fs_loc(role='m', content=0)
        (host, port) = self.gpconfig.get_hostandport_of_segment(psegmentNumber=0, pRole='m')
        cmdString = 'ps -ef|grep -v grep|grep \'%s\'|awk \'{print $2}\'|xargs kill -9' % mirror_data_loc
        remote = Command(name='kill first mirror', cmdStr=cmdString, ctxt=2, remoteHost=host)
        remote.run()
        tinctest.logger.info('run command %s' % cmdString)
        rc = remote.get_results().rc
        result = remote.get_results().stdout
        tinctest.logger.info('Command returning, rc: %s, result: %s' % (rc, result))

    def kill_master_process(self, ProcName=None):
        cmdString = 'ps -ef|grep postgres| grep %s | grep \'%s\'| awk \'{print $2}\'|xargs kill -9' % (self.pgport, ProcName)
        cmd = Command('kill process on master', cmdStr=cmdString)
        cmd.run()
        tinctest.logger.info('run command %s' % cmdString)
        rc = cmd.get_results().rc
        result = cmd.get_results().stdout
        tinctest.logger.info('Command returning, rc: %s, result: %s' % (rc, result))

    def get_default_fs_loc(self, role='m', content=0):
        fs_sql = '''select fselocation from pg_filespace_entry where fsefsoid = 3052 and fsedbid = (select dbid from gp_segment_configuration where role = \'%s\' and content = %s);''' % (role, content)
        result = PSQL.run_sql_command(fs_sql, flags='-q -t', dbname='template1')
        result = result.strip()
        filespace_loc = result.split('\n')
        return filespace_loc[0]

    def gpconfig_alter(self, type, bool):
        ''' Alter postgres configuration '''
        if bool == 'true':
            fault_string = "filerep_inject_listener_fault=true"
        elif bool == 'false':
            fault_string = "filerep_inject_listener_fault=false"
        for record in self.gpconfig.record:
            if type == 'primary':
                if record.role and record.content != -1:
                    fse_location = record.datadir
                else:
                    continue
            if type == 'mirror':
                if (not record.role) and record.content != -1:
                    fse_location = record.datadir
                else:
                    continue
            run_shell_command('ssh ' + record.hostname + ' \'echo ' + fault_string + ' >> ' + fse_location + '/postgresql.conf\'')
            tinctest.logger.info("\n ssh %s 'echo %s >> %s/postgresql.conf'" % (record.hostname, fault_string, fse_location))
            tinctest.logger.info("\n Done set %s in postgresql.conf on all primary segments" % fault_string)

    def set_faults(self, fault_name, type, role='mirror', port=None, occurence=None, sleeptime=None, seg_id=None):
        ''' Reset the fault and then issue the fault with the given type '''
        self.fileutil.inject_fault(f=fault_name, y=type, r=role, p=port, o=occurence, sleeptime=sleeptime, seg_id=seg_id)

    def resume_faults(self, fault_name, role='mirror'):
        ''' Resume the fault issues '''
        self.fileutil.inject_fault(f=fault_name, y='resume', r=role)

    def run_validation(self):
        tinctest.logger.info('Verify the integrity between primary and mirror ...')
        self.dbstate = DbStateClass('run_validation')
        self.dbstate.check_mirrorintegrity()

    def incremental_recoverseg(self, workerPool=False):
        self.gprecover.incremental(workerPool)

    def run_recoverseg_if_ct(self):
        num_down = self.gpconfig.count_of_nodes_in_mode('c')
        if (int(num_down) > 0):
            self.incremental_recoverseg()

    def wait_till_change_tracking(self):
        self.fileutil.wait_till_change_tracking_transition()

    def wait_till_insync(self):
        self.gprecover.wait_till_insync_transition()

    def run_gpstate(self, type, phase):
        self.gpstate.run_gpstate(type, phase)

    def run_gpprimarymirror(self):
        self.gpprimarymirror.run_gpprimarymirror()

    def verify_gpprimarymirror_output(self, total_resync=0, cur_resync=0):
        status = self.gpprimarymirror.verify_gpprimarymirror_output(total_resync, cur_resync)
        self.assertTrue(status, 'Total and Cur resync object count mismatch')

    def run_gpstate_shell_cmd(self, options):
        self.gpstate.run_gpstate_shell_cmd(options)

    def verify_gpstate_output(self):
        status = self.gpstate.verify_gpstate_output()
        self.assertTrue(status, 'Total and Cur resync object count mismatch')

    def run_trigger_sql(self):
        ''' Run a sql statement to trigger postmaster reset '''
        PSQL.run_sql_file(local_path('test_ddl.sql'))

    def run_fts_test_ddl_dml(self):
        PSQL.run_sql_file(local_path('fts_test_ddl_dml.sql'))

    def run_fts_test_ddl_dml_before_ct(self):
        PSQL.run_sql_file(local_path('fts_test_ddl_dml_before_ct.sql'))

    def run_fts_test_ddl_dml_ct(self):
        PSQL.run_sql_file(local_path('fts_test_ddl_dml_ct.sql'))

    def run_sql_in_background(self):
        PSQL.run_sql_command('drop table if exists bar; create table bar(i int);', background=True)

    def sleep_for_transition(self):
        # gp_segment_connect_timeout is set to 10s; still need a little more time
        # than that to complete the transition to ct
        sleep(100)

    def restart_db(self):
        self.base.stop_db()
        self.base.start_db()

    def stop_db_with_no_rc_check(self):
        ''' Gpstop and do not check for rc '''
        cmd = Command('Gpstop_a', 'gpstop -a')
        tinctest.logger.info('Executing command: gpstop -a')
        cmd.run()

    def start_db_with_no_rc_check(self):
        ''' Gpstart and do not check for rc '''
        cmd = Command('Gpstart_a', 'gpstart -a')
        tinctest.logger.info('Executing command: gpstart -a')
        cmd.run()

    def restart_db_with_no_rc_check(self):
        self.stop_db_with_no_rc_check()
        self.start_db_with_no_rc_check()

    def set_gpconfig(self, param, value):
        ''' Set the configuration parameter using gpconfig '''
        command = "gpconfig -c %s -v %s --skipvalidation " % (param, value)
        run_shell_command(command)
        self.restart_db()

    def check_db(self):
        checkDBUp()

    def check_fault_status(self, fault_name, seg_id=None, role=None):
        status = self.fileutil.check_fault_status(fault_name=fault_name, status='triggered', max_cycle=20, role=role, seg_id=seg_id)
        self.assertTrue(status, 'The fault is not triggered in the time expected')

    def cluster_state(self):
        state = self.gpconfig.is_not_insync_segments()
        self.assertTrue(state, 'The cluster is not up and in sync')
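# A usage sketch, not taken from the source: a hypothetical FtsTransitions flow
# showing how the helpers above fit together for a mirror-failure/recovery pass.
# 'filerep_consumer' appears in the other snippets in this excerpt; the fault
# type and the exact ordering here are illustrative only.

def test_fts_mirror_failover_sketch(self):
    self.cluster_state()                              # start from an up, in-sync cluster
    self.set_faults('filerep_consumer', 'fault')      # fail the mirrors (role defaults to 'mirror')
    self.run_trigger_sql()                            # DDL to force the transition
    self.wait_till_change_tracking()                  # primaries drop to change tracking
    self.run_recoverseg_if_ct()                       # incremental gprecoverseg if anything is in CT
    self.wait_till_insync()                           # wait for resync to finish
    self.run_validation()                             # verify primary/mirror integrity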
def test_gpcheckcat(self):
    tinctest.logger.info('Run Checkcat to verify persistent table consistency')
    dbstate = DbStateClass('run_validation')
    dbstate.check_catalog(alldb=False, dbname=Steps.dbname)
    self.util.wait_till_change_tracking_transition()

def wait_till_insync_transition(self):
    self.gpr.wait_till_insync_transition()

def run_gprecoverseg(self, recover_mode):
    if recover_mode == 'full':
        self.gpr.full()
    else:
        self.gpr.incremental()

def run_gpconfig(self, parameter, master_value, segment_value):
    if parameter is not None:
        self.gpconfig.setParameter(parameter, master_value, segment_value)
        self.gpstop.run_gpstop_cmd(restart='r')

def inject_fault(self, fault=None, mode=None, operation=None, prim_mirr=None,
                 host='All', table=None, database=None, seg_id=None,
                 sleeptime=None, occurence=None):
    if fault is None or mode is None or operation is None or prim_mirr is None:
        raise Exception('Incorrect parameters provided for inject fault')
    (ok, out) = self.util.inject_fault(f=fault, m=mode, y=operation, r=prim_mirr,
                                       H='ALL', table=table, database=database,
                                       sleeptime=sleeptime, o=occurence,
                                       seg_id=seg_id)
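# A minimal usage sketch of the generic inject_fault wrapper defined above,
# as it would be called from inside a test method of this class. The fault
# name 'filerep_consumer' already appears elsewhere in this module; the rest
# of the call is illustrative only.
#
#   self.inject_fault(fault='filerep_consumer', mode='async',
#                     operation='fault', prim_mirr='mirror')   # trip the fault on all mirrors
#   ...run the workload under test...
#   self.inject_fault(fault='filerep_consumer', mode='async',
#                     operation='reset', prim_mirr='mirror')   # clean up afterwards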
def check_mirror_seg(self):
    dbstate = DbStateClass('run_validation')
    dbstate.check_mirrorintegrity(master=True)
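# The count_of_nodes_* helpers above all follow the same pattern: run a
# COUNT(*) against gp_segment_configuration and parse the psql output.
# Below is a minimal, illustrative sketch of that pattern (not part of the
# original module). It assumes PSQL is imported as in the rest of this file
# and uses the '-q -t' flags, also used elsewhere here, so the result can be
# read without slicing header lines.
def count_segments_in_state(mode=None, status=None):
    """Return the number of gp_segment_configuration rows matching the
    given mode ('s', 'c', 'r') and/or status ('u', 'd')."""
    conditions = []
    if mode is not None:
        conditions.append("mode = '%s'" % mode)
    if status is not None:
        conditions.append("status = '%s'" % status)
    where = (' where ' + ' and '.join(conditions)) if conditions else ''
    sql = "select count(*) from gp_segment_configuration%s" % where
    # -q -t suppresses headers and footers, so the output is just the count
    out = PSQL.run_sql_command(sql, flags='-q -t')
    return int(out.strip())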
class SubTransactionLimitRemovalTestCase(MPPTestCase): def __init__(self, methodName): super(SubTransactionLimitRemovalTestCase,self).__init__(methodName) def check_system(self): ''' @summary: Check whether the system is up and sync. Exit out if not ''' tinctest.logger.info("[STLRTest] Running check_system") tinctest.logger.info("[STLRTest] Check whether the system is up and sync") cmd ="select count(*) from gp_segment_configuration where content<> -1 ;" (num_cl) = PSQL.run_sql_command(cmd) count_all = num_cl.split('\n')[3].strip() cmd ="select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';" (num_cl) = PSQL.run_sql_command(cmd) count_up_and_sync = num_cl.split('\n')[3].strip() tinctest.logger.info("[STLRTest] printing gp segment configuration") (gp_seg_conf) = PSQL.run_sql_command("select * from gp_segment_configuration order by dbid") tinctest.logger.info(gp_seg_conf) if count_all != count_up_and_sync : raise Exception("[STLRTest] System not in sync and up. Exiting test") else: tinctest.logger.info("[STLRTest] Starting New Test: System is up and in sync...") def run_sqls(self,test): ''' @summary : Run the sql @param test: the sql file list ''' tinctest.logger.info("[STLRTest] Running run_sqls") tinctest.logger.info("[STLRTest]Starting new thread to run sql %s"%(test)) PSQL.run_sql_file(local_path(test)) def suspend_faults(self,fault_name): ''' @summary : Suspend the specified fault: reset it before issuing suspend @param fault_name : Name of the fault to suspend ''' tinctest.logger.info("[STLRTest] Running suspend_faults") self.util = Filerepe2e_Util() (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'reset', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done resetting the %s fault"%(fault_name)) (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'suspend', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done suspending the %s fault"%(fault_name)) def check_fault_status(self,fault_name = None, status = None, max_cycle=10): ''' Check whether a fault is triggered. 
Poll till the fault is triggered @param name : Fault name @param status : Status to be checked - triggered/completed ''' self.util = Filerepe2e_Util() tinctest.logger.info("[STLRTest] Running check_fault_status %s", status) if (not fault_name) or (not status) : raise Exception("[STLRTest]Need a value for fault_name and status to continue") poll =0 while(poll < max_cycle): (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'status', r = 'primary', H ='ALL') poll +=1 for line in out1.splitlines(): if line.find(fault_name) > 0 and line.find(status) > 0 : tinctest.logger.info("[STLRTest]Fault %s is %s " % (fault_name,status)) poll = 0 tinctest.logger.info("[STLRTest] Running check_fault_status %s TRUE", status) return True #sleep a while before start polling again sleep(10) tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE", status) return False def filerep_fault(self,trans_state): ''' @summary : Inject the filerep fault supplied @param trans_state : type of transition ''' tinctest.logger.info("[STLRTest] Running filerep_fault") self.util = Filerepe2e_Util() if trans_state == 'failover_to_primary': tinctest.logger.info("[STLRTest] primary failover") (ok1,out1) = self.util.inject_fault(f='filerep_consumer', m = 'async', y = 'fault', r = 'mirror', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done primary failover fault") elif trans_state == 'failover_to_mirror': tinctest.logger.info("[STLRTest] fault for postmaster panic") (ok1,out1) = self.util.inject_fault(f='postmaster', m = 'async', y = 'panic', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done postmaster panic fault") elif trans_state == 'postmaster_reset': tinctest.logger.info("[STLRTest] fault for filerep_sender panic") (ok1,out1) = self.util.inject_fault(f='filerep_sender', m = 'async', y = 'panic', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done filerep_sender panic fault") tinctest.logger.info("[STLRTest] Done Injecting Fault") def resume_faults(self,fault_name,trans_state): '''' @summary : Resume the fault and check status ''' self.util = Filerepe2e_Util() tinctest.logger.info("[STLRTest] Running resume_faults") if not trans_state == 'failover_to_mirror' : tinctest.logger.info("[STLRTest] fault for %s resume" % fault_name) (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'resume', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault resume failed") tinctest.logger.info("[STLRTest]Done fault for %s resume" % fault_name) if trans_state == 'postmaster_reset': (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'resume', r = 'mirror', H ='ALL') if not ok1: tinctest.logger.info("[STLRTest]Failed fault for %s resume on mirror" % fault_name) if trans_state == 'failover_to_primary' : self.check_fault_status(fault_name,'completed') def checkPSQLRun(self, test): '''Check if the psql run started in parallel is over before running the _post.sql ''' tinctest.logger.info("[STLRTest] Running checkPSQLRun") cmd_str = 'ps -ef|grep sub_transaction_limit_removal|grep psql' while(1): is_running = 0 (rc , out) = shell.run(cmd_str) for line in out: if '%s' %test in line: is_running = 1 if is_running == 0: return True else: sleep(10) return False def resume_filerep_resync(self): self.util = Filerepe2e_Util() tinctest.logger.info("[STLRTest] Running 
resume_filerep_resync") tinctest.logger.info("[STLRTest] fault for failover_to_mirror resume") (ok1,out1) = self.util.inject_fault(f='filerep_resync', m = 'async', y = 'resume', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done fault for failover_to_mirror resume") sleep(10) def stop_start_validate(self, expect_down_segments=False): """ Do gpstop -i, gpstart and see if all segments come back up fine """ self.gpstart = GpStart() self.gpstop = GpStop() tinctest.logger.info("[STLRTest] Running stop_start_validate") tinctest.logger.info("[STLRTest]Shutting down the cluster") ok = self.gpstop.run_gpstop_cmd(immediate = 'i') if not expect_down_segments: if not ok: raise Exception('[STLRTest]Problem while shutting down the cluster') tinctest.logger.info("[STLRTest]Successfully shutdown the cluster.") tinctest.logger.info("[STLRTest]Restarting the cluster.") ok = self.gpstart.run_gpstart_cmd() if not ok: raise Exception('[STLRTest]Failed to bring the cluster back up') tinctest.logger.info("[STLRTest]Successfully restarted the cluster.") if not self.anydownsegments(): raise Exception("[STLRTest]segments were marked down") else: return (True, "All segments are up") def run_gprecoverseg(self,recover_option): ''' @summary : Call gpecoverseg full or incremental to bring back the cluster to sync ''' self.gpr = GpRecover() tinctest.logger.info("[STLRTest] Running run_gprecoverseg") if recover_option == 'full': self.gpr.full() else: self.gpr.incremental() self.gpr.wait_till_insync_transition() def run_restart_database(self): ''' @summary : Restart the database ''' self.gpstart = GpStart() self.gpstop = GpStop() tinctest.logger.info("[STLRTest] Running run_restart_database") ok = self.gpstop.run_gpstop_cmd(immediate = 'i') tinctest.logger.info(ok) ok = self.gpstart.run_gpstart_cmd() tinctest.logger.info(ok) def reset_faults(self,fault_name,current_cluster_state): '''' @summary : Reset the faults at the end of test ''' self.util = Filerepe2e_Util() tinctest.logger.info("[STLRTest] Running reset_faults") tinctest.logger.info("[STLRTest] Resetting fault before ending test") (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'reset', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done resetting %s fault" %(fault_name)) if current_cluster_state == 'resync': (ok1,out1) = self.util.inject_fault(f='filerep_resync', m = 'async', y = 'reset', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done filerep_resync fault") (ok1,out1) = self.util.inject_fault(f='checkpoint', m = 'async', y = 'reset', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done resetting checkpoint fault" ) def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None): self.dbstate = DbStateClass('run_validation') tinctest.logger.info("[STLRTest] Running do_gpcheckcat") self.dbstate.check_catalog() return True def _validation(self): ''' @summary :gpcheckcat and gpcheckmirrorintegrity ''' ###psql.run_shell_command("CHECKPOINT; CHECKPOINT; CHECKPOINT;CHECKPOINT; CHECKPOINT;") ###sleep(30) # sleep for some time for the segments to be in sync before validation self.dbstate = DbStateClass('run_validation') tinctest.logger.info("[STLRTest] Running _validation") outfile = local_path("subt_checkcat.out") 
self.dbstate.check_catalog(outputFile=outfile) self.dbstate.check_mirrorintegrity() def inject_and_resume_fault(self, fault_name, trans_state): self.check_fault_status(fault_name, 'triggered') self.filerep_fault(trans_state) if trans_state == 'failover_to_mirror' : PSQL.run_sql_file(local_path('test_while_ct.sql')) self.resume_faults(fault_name, trans_state) def run_post_sqls(self, fault_name ='', trans_state=''): PSQL.wait_for_database_up(); if (trans_state == 'failover_to_primary' or trans_state == ''): post_sql = "failover_sql/subt_create_table_ao_post_commit" else: post_sql = "failover_sql/subt_create_table_ao_post_abort" sql_file = post_sql+".sql" ans_file = post_sql+".ans" out_file = post_sql+".out" PSQL.run_sql_file(sql_file = local_path(sql_file), out_file = local_path(out_file)) diff_res = Gpdiff.are_files_equal(local_path(out_file), local_path(ans_file)) if not diff_res: self.fail("[STLRTest]Gpdiff failed for : %s %s" %(fault_name, trans_state)) def reset_all_faults(self): '''' @summary : Reset all faults on primary and mirror ''' tinctest.logger.info("[STLRTest] Running reset_all_faults") self.util = Filerepe2e_Util() (ok1,out1) = self.util.inject_fault(f='all', m = 'async', y = 'reset', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done resetting all faults on primary") (ok1,out1) = self.util.inject_fault(f='all', m = 'async', y = 'reset', r = 'mirror', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done resetting all faults fault on mirror") def kill_zombies(self): ''' @summary : There are stray zombie processes running after each test. This method clears them ''' tinctest.logger.info("[STLRTest] Running kill_zombies") cmd_str = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1" cmd = Command("shell_command", cmd_str) tinctest.logger.info('Executing command: %s : %s' %("shell command", cmd_str)) cmd.run() result = cmd.get_results() out = result.stdout lines = out.split('\n') for line in lines: pids = line.split('#') if pids[0] == '1': kill_str= "kill -9 %s" %(pids[1]) cmd2 = Command("kill_command", kill_str) cmd2.run() def skip_checkpoint(self): ''' @summary : Routine to inject fault that skips checkpointing ''' self.util = Filerepe2e_Util() tinctest.logger.info("[STLRTest] Running skip_checkpoint") (ok1,out1) = self.util.inject_fault(f='checkpoint', m = 'async', y = 'reset', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done resetting the checkpoint fault") (ok1,out1) = self.util.inject_fault(f='checkpoint', m = 'async', y = 'skip', r = 'primary', H ='ALL') if not ok1: raise Exception("[STLRTest]Fault injection failed") tinctest.logger.info("[STLRTest]Done skipping the checkpoint fault") def method_setup(self): tinctest.logger.info("Performing setup tasks") gpfs=Gpfilespace() gpfs.create_filespace('subt_filespace_a') def cleandb(self): db = Database() db.setupDatabase('gptest')
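# check_fault_status above polls the fault injector until the requested fault
# reports the desired status. The function below is a condensed, illustrative
# sketch of that polling loop (not part of the original module); it assumes
# Filerepe2e_Util, tinctest and sleep are available as in the rest of this
# file, and the fault name passed in is whatever the caller is testing.
def wait_for_fault(fault_name, status='triggered', max_cycle=10, interval=10):
    """Poll until fault_name reports the given status; True if seen in time."""
    util = Filerepe2e_Util()
    for _ in range(max_cycle):
        (ok, out) = util.inject_fault(f=fault_name, m='async', y='status',
                                      r='primary', H='ALL')
        if ok and any(fault_name in line and status in line
                      for line in out.splitlines()):
            tinctest.logger.info("Fault %s reached status %s" % (fault_name, status))
            return True
        sleep(interval)  # give the fault a chance to be hit before re-polling
    return False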
class AOCOAlterColumn(MPPTestCase): def __init__(self): self.fileutil = Filerepe2e_Util() self.gprecover = GpRecover() self.config = GpConfig() self.base_dir = os.path.dirname(sys.modules[self.__class__.__module__].__file__) def get_sql_files(self, sql_file_name): sql_file = os.path.join( self.base_dir, "sql", sql_file_name + ".sql"); return sql_file def validate_sql(self, ans_file, out_file): ''' Compare the out and ans files ''' init_file=os.path.join( self.base_dir, "sql",'init_file') result1 = Gpdiff.are_files_equal(out_file, ans_file, match_sub =[init_file]) self.assertTrue(result1 ,'Gpdiff.are_files_equal') def run_sql(self, filename, out_file,background=False): ''' Run the provided sql and validate it ''' out_file = local_path(filename.replace(".sql", ".out")) PSQL.run_sql_file(filename,out_file=out_file,background=background) def run_test_CatalogCheck(self, action,storage): file_name =action+'_'+storage sql_file = self.get_sql_files(file_name) out_file = self.base_dir+ "/sql/"+file_name+'.out' tinctest.logger.info( 'sql-file == %s \n' % sql_file) tinctest.logger.info( 'out-file == %s \n' % out_file) # Run Add/Drop Column script self.run_sql(sql_file, out_file=out_file) def validate_test_CatalogCheck(self, action,storage): file_name =action+'_'+storage out_file = self.base_dir+ "/sql/"+file_name+'.out' ans_file = self.base_dir+ "/expected/"+file_name+'.ans' tinctest.logger.info( 'out-file == %s \n' % out_file) tinctest.logger.info( 'ans-file == %s \n' % ans_file) # Validate Ans file self.validate_sql(ans_file,out_file) if storage == 'multisegfiles': ''' check if multi_segfile_tab file has multiple segfiles per column ''' tablename='multi_segfile_tab' relid = self.get_relid(file_name=tablename ) utilitymodeinfo=self.get_utilitymode_conn_info( relid=relid) u_port=utilitymodeinfo[0] u_host=utilitymodeinfo[1] assert(1 < int(self.get_segment_cnt(relid=relid,host=u_host,port= u_port))) # Check Correctness of the catalog self.dbstate = DbStateClass('run_validation') outfile = local_path("gpcheckcat_"+datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')+".out") self.dbstate.check_catalog(outputFile=outfile) def run_test_ChangeTracking(self,filename): # Log the segment state before starting the test # Expectation is a SYNC state self.log_segment_state() primary_dbid=self.get_dbid() # Run the 'alter table add column cmmand' in the background self.run_sql_ChangeTracking(filename,stage='fail',validate=False,background=True) # Inject Fault to put one primary in panic self.fileutil.inject_fault(f='postmaster', y='reset', seg_id=primary_dbid) self.fileutil.inject_fault(f='postmaster', y='panic', seg_id=primary_dbid) state=self.fileutil.check_fault_status(fault_name='postmaster', status='triggered') self.log_segment_state() # Recover the down segments self.recover_seg() self.log_segment_state() # Validate that the previous alter failed because primary segment went down as the alter was taking place self.run_sql_ChangeTracking(filename,stage='failvalidate',validate=True,background=False) # Now the system is in change tracking so the next alter should pass self.run_sql_ChangeTracking(filename,stage='pass',validate=True,background=False) self.log_segment_state() def recover_seg(self): result=self.get_segcount_state(state='d') if result > 0: if not self.gprecover.incremental(): raise Exception('Gprecoverseg failed') if not self.gprecover.wait_till_insync_transition(): raise Exception('Segments not in sync') tinctest.logger.info('Segments recovered and back in sync') def 
run_sql_ChangeTracking(self,filename,stage,validate=False,background=False): fname=filename+'-'+stage sql_file = self.get_sql_files(fname) out_file = self.base_dir+ "/sql/"+fname +'.out' ans_file = self.base_dir+ "/expected/"+fname+'.ans' tinctest.logger.info( '\n==============stage = %s ================' % (stage)) tinctest.logger.info( sql_file) tinctest.logger.info( out_file) tinctest.logger.info( ans_file) tinctest.logger.info( '==============================') result=self.run_sql(sql_file,out_file=out_file,background=background) if validate == True: self.validate_sql(ans_file,out_file) def get_dbid(self): sql_cmd = "select min(dbid) dbid from gp_segment_configuration where role = 'p' and status = 'u' and content > -1" dbid=PSQL.run_sql_command(sql_cmd= sql_cmd,flags='-q -t') tinctest.logger.info('Segments %s chosen for fault injection' % (dbid)) return dbid def log_segment_state(self): sql_cmd = "select * from gp_segment_configuration order by dbid" result=PSQL.run_sql_command(sql_cmd= sql_cmd) tinctest.logger.info('==========================') tinctest.logger.info('State of Segments ') tinctest.logger.info(result) tinctest.logger.info('==========================') def get_segcount_state(self,state): sql_cmd = "select count(*) from gp_segment_configuration where status = '%s'" % (state) result=PSQL.run_sql_command(sql_cmd= sql_cmd,flags='-q -t') tinctest.logger.info('Number of segments in %s State == %d' % (state,(int(result)))) return int(result) def get_utilitymode_conn_info(self, relid=0): #get the segment_id where to log in utility mode and then get the hostname and port for this segment sql_cmd="select port, hostname from gp_segment_configuration sc where dbid > 1 and role = 'p' limit 1;" utilitymodeinfo=PSQL.run_sql_command(sql_cmd=sql_cmd, flags='-q -t') u_port=utilitymodeinfo.strip().split('|')[0] u_host=utilitymodeinfo.strip().split('|')[1] return [u_port,u_host] def get_relid(self,file_name=None): sql_cmd="SELECT oid FROM pg_class WHERE relname='%s';\n" % file_name relid= PSQL.run_sql_command(sql_cmd=sql_cmd, flags='-q -t') return relid; def get_segment_cnt(self, relid=0,host=None,port=None): sql_cmd="select count(*) from gp_toolkit.__gp_aocsseg(%s) group by column_num having count(*) > 1 limit 1" % (relid) segcnt=PSQL.run_sql_command_utility_mode(sql_cmd=sql_cmd,host=host, port=port,flags='-q -t') if (len(segcnt.strip()) == 0): segcnt='0' return segcnt def run_test_utility_mode(self,filename): #alter_aoco_tab_utilitymode relid = self.get_relid(file_name=filename ) utilitymodeinfo=self.get_utilitymode_conn_info( relid=relid) u_port=utilitymodeinfo[0] u_host=utilitymodeinfo[1] self.run_sql_utility_mode(filename,host=u_host,port=u_port) def run_sql_utility_mode(self,filename,host=None,port=None): fname=filename sql_file = self.get_sql_files(fname) out_file = self.base_dir+ "/sql/"+fname +'.out' ans_file = self.base_dir+ "/expected/"+fname+'.ans' tinctest.logger.info( '\n==============================') tinctest.logger.info( sql_file) tinctest.logger.info( out_file) tinctest.logger.info( ans_file) tinctest.logger.info( '==============================') result=PSQL.run_sql_file_utility_mode(sql_file,out_file=out_file,host=host, port=port) self.validate_sql(ans_file,out_file)
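# The utility-mode helpers above first look up a primary segment's host and
# port from gp_segment_configuration and then run a query directly against
# that segment. The sketch below condenses that flow into one illustrative
# helper (not part of the original module); PSQL is assumed to be imported as
# in the rest of this file.
def query_segment_in_utility_mode(sql_cmd):
    """Pick one primary segment and run sql_cmd against it in utility mode."""
    conn_sql = ("select port, hostname from gp_segment_configuration "
                "where dbid > 1 and role = 'p' limit 1;")
    port, host = PSQL.run_sql_command(sql_cmd=conn_sql,
                                      flags='-q -t').strip().split('|')
    return PSQL.run_sql_command_utility_mode(sql_cmd=sql_cmd,
                                             host=host.strip(),
                                             port=port.strip(),
                                             flags='-q -t')

# Illustrative call, with a made-up relation oid:
#   query_segment_in_utility_mode(
#       "select count(*) from gp_toolkit.__gp_aocsseg(12345) "
#       "group by column_num having count(*) > 1 limit 1")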
def test_gpcheckcat(self):
    dbstate = DbStateClass('run_validation')
    dbstate.check_catalog(alldb=False, dbname=Verification.dbname)
def test_mpp23395(self): """ @description Test MPP-20964, uncleaned lock table by pg_terminate_backend @product_version gpdb: [4.3.3.1-],[4.2.8.5-4.2.99.99] """ self.util = Filerepe2e_Util() (ok,out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1); if not ok: raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared") # setup PSQL.run_sql_command(""" DROP TABLE IF EXISTS mpp23395; """) # Scenario 1: FAULT during Create Table on master sql = ''' CREATE TABLE mpp23395(a int); ''' self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1); # Scenario 2: FAULT during Drop Table on master, COMMIT case sql = ''' DROP TABLE mpp23395; ''' self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1); (ok,out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1); if not ok: raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared") # Scenario 3: FAULT during Create Table on segment, COMMIT case sql = ''' SET debug_dtm_action_target = "protocol"; SET debug_dtm_action_protocol = "commit_prepared"; SET debug_dtm_action_segment = 0; SET debug_dtm_action = "fail_begin_command"; CREATE TABLE mpp23395(a int); ''' self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2); # Scenario 4: FAULT during Drop Table on segment, COMMIT case sql = ''' SET debug_dtm_action_target = "protocol"; SET debug_dtm_action_protocol = "commit_prepared"; SET debug_dtm_action_segment = 0; SET debug_dtm_action = "fail_begin_command"; DROP TABLE mpp23395; ''' self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2); # Scenario 5: FAULT during Create Table on master, ABORT case (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1); if not ok: raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared") sql = ''' CREATE TABLE mpp23395(a int); ''' self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1); (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1); if not ok: raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared") PSQL.run_sql_command(""" CREATE TABLE mpp23395(a int); """) # Scenario 6: FAULT during Drop Table on master, ABORT case (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1); if not ok: raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared") sql = ''' DROP TABLE mpp23395; ''' self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1); (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1); if not ok: raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared") PSQL.run_sql_command(""" DROP TABLE mpp23395; """) dbstate = DbStateClass('run_validation') dbstate.check_catalog()
def check_mirror_seg(self, master=False):
    tinctest.logger.info("running check mirror")
    dbstate = DbStateClass('run_validation')
    dbstate.check_mirrorintegrity(master=master)
class FtsTransitions(MPPTestCase): def __init__(self, methodName): self.pgport = os.environ.get('PGPORT') self.fileutil = Filerepe2e_Util() self.gpconfig = GPDBConfig() self.gprecover = GpRecover(self.gpconfig) self.gpstate = Gpstate() self.gpprimarymirror = Gpprimarymirror() self.base = GPDBStorageBaseTestCase(self.gpconfig) super(FtsTransitions, self).__init__(methodName) def kill_first_mirror(self): mirror_data_loc = self.get_default_fs_loc(role='m', content=0) (host, port) = self.gpconfig.get_hostandport_of_segment(psegmentNumber=0, pRole='m') cmdString = 'ps -ef|grep -v grep|grep \'%s\'|awk \'{print $2}\'|xargs kill -9' % mirror_data_loc remote = Command(name='kill first mirror', cmdStr=cmdString, ctxt=2, remoteHost=host) remote.run() tinctest.logger.info('run command %s' % cmdString) rc = remote.get_results().rc result = remote.get_results().stdout tinctest.logger.info('Command returning, rc: %s, result: %s' % (rc, result)) def kill_master_process(self, ProcName=None): cmdString = 'ps -ef|grep postgres| grep %s | grep \'%s\'| awk \'{print $2}\'|xargs kill -9' % ( self.pgport, ProcName) cmd = Command('kill process on master', cmdStr=cmdString) cmd.run() tinctest.logger.info('run command %s' % cmdString) rc = cmd.get_results().rc result = cmd.get_results().stdout tinctest.logger.info('Command returning, rc: %s, result: %s' % (rc, result)) def get_default_fs_loc(self, role='m', content=0): fs_sql = '''select fselocation from pg_filespace_entry where fsefsoid = 3052 and fsedbid = (select dbid from gp_segment_configuration where role = \'%s\' and content = %s);''' % (role, content) result = PSQL.run_sql_command(fs_sql, flags='-q -t', dbname='template1') result = result.strip() filespace_loc = result.split('\n') return filespace_loc[0] def gpconfig_alter(self, type, bool): ''' Alter postgres configuration ''' if bool == 'true': fault_string = "filerep_inject_listener_fault=true" elif bool == 'false': fault_string = "filerep_inject_listener_fault=false" for record in self.gpconfig.record: if type == 'primary': if record.role and record.content != -1: fse_location = record.datadir else: continue if type == 'mirror': if (not record.role) and record.content != -1: fse_location = record.datadir else: continue run_shell_command('ssh ' + record.hostname + ' \'echo ' + fault_string + ' >> ' + fse_location + '/postgresql.conf\'') tinctest.logger.info( "\n ssh %s 'echo %s >> %s/postgresql.conf'" % (record.hostname, fault_string, fse_location)) tinctest.logger.info( "\n Done set %s in postgresql.conf on all primary segments" % fault_string) def set_faults(self, fault_name, type, role='mirror', port=None, occurence=None, sleeptime=None, seg_id=None): ''' Reset the fault and then issue the fault with the given type''' self.fileutil.inject_fault(f=fault_name, y=type, r=role, p=port, o=occurence, sleeptime=sleeptime, seg_id=seg_id) def resume_faults(self, fault_name, role='mirror'): ''' Resume the fault issues ''' self.fileutil.inject_fault(f=fault_name, y='resume', r=role) def run_validation(self): tinctest.logger.info( 'Veriy the integrity between primary and mirror ...') self.dbstate = DbStateClass('run_validation') self.dbstate.check_mirrorintegrity() def incremental_recoverseg(self, workerPool=False): self.gprecover.incremental(workerPool) def run_recoverseg_if_ct(self): num_down = self.gpconfig.count_of_nodes_in_mode('c') if (int(num_down) > 0): self.incremental_recoverseg() def wait_till_change_tracking(self): self.fileutil.wait_till_change_tracking_transition() def wait_till_insync(self): 
self.gprecover.wait_till_insync_transition() def run_gpstate(self, type, phase): self.gpstate.run_gpstate(type, phase) def run_gpprimarymirror(self): self.gpprimarymirror.run_gpprimarymirror() def verify_gpprimarymirror_output(self, total_resync=0, cur_resync=0): status = self.gpprimarymirror.verify_gpprimarymirror_output( total_resync, cur_resync) self.assertTrue(status, 'Total and Cur resync object count mismatch') def run_gpstate_shell_cmd(self, options): self.gpstate.run_gpstate_shell_cmd(options) def verify_gpstate_output(self): status = self.gpstate.verify_gpstate_output() self.assertTrue(status, 'Total and Cur resync object count mismatch') def run_trigger_sql(self): ''' Run a sql statement to trigger postmaster reset ''' PSQL.run_sql_file(local_path('test_ddl.sql')) def run_fts_test_ddl_dml(self): PSQL.run_sql_file(local_path('fts_test_ddl_dml.sql')) def run_fts_test_ddl_dml_before_ct(self): PSQL.run_sql_file(local_path('fts_test_ddl_dml_before_ct.sql')) def run_fts_test_ddl_dml_ct(self): PSQL.run_sql_file(local_path('fts_test_ddl_dml_ct.sql')) def run_sql_in_background(self): PSQL.run_sql_command( 'drop table if exists bar; create table bar(i int);', background=True) def sleep_for_transition(self): #gp_segment_connect_timeout is set to 10s , still need a little more time than that to complete the transition to ct sleep(100) def restart_db(self): self.base.stop_db() self.base.start_db() def stop_db_with_no_rc_check(self): ''' Gpstop and dont check for rc ''' cmd = Command('Gpstop_a', 'gpstop -a') tinctest.logger.info('Executing command: gpstop -a') cmd.run() def start_db_with_no_rc_check(self): ''' Gpstart and dont check for rc ''' cmd = Command('Gpstart_a', 'gpstart -a') tinctest.logger.info('Executing command: gpstart -a') cmd.run() def restart_db_with_no_rc_check(self): self.stop_db_with_no_rc_check() self.start_db_with_no_rc_check() def set_gpconfig(self, param, value): ''' Set the configuration parameter using gpconfig ''' command = "gpconfig -c %s -v %s --skipvalidation " % (param, value) run_shell_command(command) self.restart_db() def check_db(self): checkDBUp() def check_fault_status(self, fault_name, seg_id=None, role=None): status = self.fileutil.check_fault_status(fault_name=fault_name, status='triggered', max_cycle=20, role=role, seg_id=seg_id) self.assertTrue(status, 'The fault is not triggered in the time expected') def cluster_state(self): state = self.gpconfig.is_not_insync_segments() self.assertTrue(state, 'The cluster is not up and in sync')
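# kill_first_mirror and kill_master_process above both build a
# ps | grep | awk | xargs kill -9 pipeline and run it through the framework's
# Command wrapper. The helper below sketches the same idea with only the
# standard library, so each step is explicit; it is illustrative and not part
# of the original module.
import os
import signal
import subprocess


def kill_processes_matching(pattern):
    """Send SIGKILL to every local process whose command line contains pattern."""
    out = subprocess.check_output(['ps', '-eo', 'pid,args'])
    for line in out.decode('utf-8', 'replace').splitlines():
        pid, _, args = line.strip().partition(' ')
        if pid.isdigit() and pattern in args:
            try:
                os.kill(int(pid), signal.SIGKILL)
            except OSError:
                pass  # the process exited between ps and kill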
class SuspendCheckpointCrashRecovery(MPPTestCase): def __init__(self,methodName): self.fileutil = Filerepe2e_Util() self.config = GPDBConfig() self.gprecover = GpRecover(self.config) self.gpstart = GpStart() self.gpstop = GpStop() self.gpfile = Gpfilespace(self.config) self.dbstate = DbStateClass('run_validation', self.config) self.port = os.getenv('PGPORT') self.base = GPDBStorageBaseTestCase() super(SuspendCheckpointCrashRecovery,self).__init__(methodName) def check_system(self): ''' @summary: Check whether the system is up and sync. Exit out if not ''' cmd ="select count(*) from gp_segment_configuration where content<> -1 ;" count_all = PSQL.run_sql_command(cmd, flags ='-q -t', dbname='postgres') cmd ="select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';" count_up_and_sync = PSQL.run_sql_command(cmd, flags ='-q -t', dbname='postgres') if count_all.strip() != count_up_and_sync.strip() : os._exit(1) else: tinctest.logger.info("\n Starting New Test: System is up and in sync .........") def get_items_list(self, tests): ''' Get file contents to a list ''' test_file = local_path(tests) with open(test_file, 'r') as f: test_list = [line.strip() for line in f] return test_list def checkPSQLRun(self, test): '''Check if the psql run started in background is over before running the _post.sql ''' cmd_str = "ps -ef|grep '%s'|grep [p]sql" % test while(1): is_running = 0 cmd = Command('Check psql run', cmd_str) cmd.run() result = cmd.get_results() for line in result.stdout.splitlines(): if '%s' %test in line: tinctest.logger.info(line) is_running = 1 if is_running == 0: return True else: sleep(5) return False def modify_sql_file(self, filename): ans_file = local_path(filename.replace('.sql' , '.ans')) for sfile in (filename, ans_file): for line in fileinput.FileInput(sfile,inplace=1): line = re.sub('gptest', os.getenv('PGDATABASE'), line) print str(re.sub('\n','',line)) def validate_sql(self, filename): ''' Compare the out and ans files ''' out_file = local_path(filename.replace(".sql", ".out")) ans_file = local_path(filename.replace('.sql' , '.ans')) assert Gpdiff.are_files_equal(out_file, ans_file) def run_sql(self, filename): ''' Run the provided sql and validate it ''' out_file = local_path(filename.replace(".sql", ".out")) PSQL.run_sql_file(sql_file = filename, out_file = out_file) self.validate_sql(filename) def set_faults_before_executing_pre_sqls(self, cluster_state): ''' Set the checkpoint skip fault ''' if cluster_state == 'change_tracking': self.cluster_in_change_tracking() self.fileutil.inject_fault(f='checkpoint', y='reset', r='primary', p=self.port) self.fileutil.inject_fault(f='checkpoint', y='skip', r='primary', p=self.port, o='0') tinctest.logger.info('Successfully injected fault to skip checkpointing') if(cluster_state == 'resync'): self.fileutil.inject_fault(f='filerep_consumer', y='reset') self.fileutil.inject_fault(f='filerep_consumer', y='fault') self.fileutil.wait_till_change_tracking_transition() def suspend_fault(self, fault_name): ''' Suspend the provided fault_name ''' self.fileutil.inject_fault(f='%s' % fault_name, y='reset', o='0', r='primary', p=self.port) self.fileutil.inject_fault(f='%s' % fault_name, y='suspend', o='0', r='primary', p=self.port) tinctest.logger.info('Successfully injected fault to suspend %s' % fault_name) def get_faults_before_executing_trigger_sqls(self, pass_num,cluster_state, test_type, ddl_type, aborting_create_needed=False): ''' Get the fault before trigger sqls are executed ''' fault_name='' 
tinctest.logger.info('Fault Conditions: pass_num = [%s], cluster_state = [%s], test_type = [%s], ddl_type = [%s], aborting_create_needed = [%s]' % (pass_num, cluster_state, test_type, ddl_type, aborting_create_needed)) if pass_num == 1 and test_type == 'commit' and ddl_type == 'create': if aborting_create_needed: fault_name = 'finish_prepared_transaction_commit_pass1_aborting_create_needed' else: fault_name = 'finish_prepared_transaction_commit_pass1_from_create_pending_to_created' elif pass_num == 2 and test_type == 'commit' and ddl_type == 'create': if aborting_create_needed: fault_name = 'finish_prepared_transaction_commit_pass2_aborting_create_needed' else: fault_name = 'finish_prepared_transaction_commit_pass2_from_create_pending_to_created' elif pass_num == 1 and test_type == 'commit' and ddl_type == 'drop': fault_name = 'finish_prepared_transaction_commit_pass1_from_drop_in_memory_to_drop_pending' elif pass_num == 2 and test_type == 'commit' and ddl_type == 'drop': fault_name = 'finish_prepared_transaction_commit_pass2_from_drop_in_memory_to_drop_pending' elif pass_num == 1 and test_type == 'abort': if aborting_create_needed: fault_name = 'finish_prepared_transaction_abort_pass1_aborting_create_needed' else: fault_name = 'finish_prepared_transaction_abort_pass1_from_create_pending_to_aborting_create' elif pass_num == 2 and test_type == 'abort': if aborting_create_needed: fault_name = 'finish_prepared_transaction_abort_pass2_aborting_create_needed' else: fault_name = 'finish_prepared_transaction_abort_pass2_from_create_pending_to_aborting_create' elif pass_num == 0 and (test_type == 'abort' or test_type == 'commit'): pass # We already set the fault error_txn_abort_after_dist_prepare_on_master above for abort tests and for commit tests skip checkpoint is done by default for all tests. return fault_name def set_faults_before_executing_trigger_sqls(self, pass_num,cluster_state, test_type, ddl_type, aborting_create_needed=False): ''' Set the fault before trigger sqls are executed ''' if (cluster_state == 'resync'): self.cluster_in_resync() fault_name='' fault_name = self.get_faults_before_executing_trigger_sqls(pass_num,cluster_state, test_type, ddl_type, aborting_create_needed=False); if (test_type == 'abort'): self.fileutil.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', p=self.port, o='0', seg_id=1) self.fileutil.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', p=self.port, o='0', seg_id=1) tinctest.logger.info('Successfully injected fault to error out after distributed prepare for abort tests') if pass_num !=0 : self.suspend_fault(fault_name) elif pass_num == 0 : fault_name = None if (cluster_state == 'resync'): self.fileutil.inject_fault(f='filerep_transition_to_sync_begin', y = 'reset', r = 'primary') self.fileutil.inject_fault(f='filerep_transition_to_sync_begin', y = 'suspend', r = 'primary') tinctest.logger.info('Successfully suspended filerep_transition_to_sync_begin') #Resume resync so that trigger sql can execute while resync is in progress self.fileutil.inject_fault(f='filerep_resync', y = 'resume', r = 'primary') return fault_name def cluster_in_resync(self): ''' 1. Suspend filerep_resync, 2. Suspend filerep_transition_to_sync_before_checkpoint, 3. 
Run gprecoverseg ''' self.base.invoke_fault('filerep_resync', 'suspend', role='primary') self.base.invoke_fault('filerep_transition_to_sync_before_checkpoint', 'suspend', role='primary', port=self.port , occurence='0') rc = self.gprecover.incremental() if not rc: raise Exception('Gprecvoerseg failed') tinctest.logger.info('Cluster in resync state') def switch_primary_mirror_role_in_utility_mode(self): '''Utility routine to start the master, connect in utility mode, switch the roles of primary and mirrors and shutdown the master ''' cmd = Command('Start master in utility mode', 'export GPSTART_INTERNAL_MASTER_ONLY=1;gpstart -m') cmd.run(validateAfter=True) result = cmd.get_results() if result.rc != 0: raise Exception('Unable to start master in utility mode') tinctest.logger.info('Started master in utility mode') sql_cmd_list = ["update gp_segment_configuration set role='t' where role ='p' and content <> -1", "update gp_segment_configuration set role='p',mode='c' where role ='m' and content <> -1", "update gp_segment_configuration set role='m',status='d' where role ='t' and content <> -1"] for sql_cmd in sql_cmd_list: PSQL.run_sql_command(sql_cmd, PGOPTIONS="-c gp_session_role=utility -c allow_system_table_mods=dml") tinctest.logger.info('Updated the catalog to reverse the roles') rc = self.gpstop.run_gpstop_cmd(masteronly = True) if not rc: raise Exception('Failure to shut down the master') def stop_db(self): ''' gpstop immediate''' rc = self.gpstop.run_gpstop_cmd(immediate = True) if not rc: raise Exception('Failed to stop the cluster') tinctest.logger.info('Stopped cluster immediately') def start_db(self, down_segments=False): ''' Gpstart -a ''' rc = self.gpstart.run_gpstart_cmd() if not rc: raise Exception('Failed to start the cluster') tinctest.logger.info('Started the cluster successfully') if not down_segments: if self.config.is_down_segments(): raise Exception('Segments got marked down') ''' This is sleep free version based on fault triggered status ''' def run_crash_and_recovery_fast(self,test_dir, pass_num, cluster_state, test_type, ddl_type, aborting_create_needed=False): if pass_num == 0: self.wait_till_all_sqls_done() else: mydir=local_path(test_dir)+'/trigger_sql/sql/' tinctest.logger.info('mydir = %s ' % mydir) trigger_count = len(glob.glob1(mydir,"*trigger.sql")) tinctest.logger.info('*** Count of trigger : %s *** ' % (trigger_count)) if test_dir == "abort_create_tests": ''' vacuum full sql don't hit the suspend fault.''' trigger_count = trigger_count - 1 if test_dir == "abort_create_needed_tests": ''' Not all SQLs hit the fault for this case, hence wait for them to complete and then others to hit the fault''' self.wait_till_all_sqls_done(8 + 1) trigger_count = 8 if test_dir == "abort_abort_create_needed_tests": ''' Not all SQLs hit the fault for this case, hence wait for them to complete and then others to hit the fault''' self.wait_till_all_sqls_done(6 + 1) trigger_count = 6 fault_type = self.get_faults_before_executing_trigger_sqls(pass_num, cluster_state, test_type, ddl_type, aborting_create_needed=False) fault_hit = self.fileutil.check_fault_status(fault_name=fault_type, status="triggered", num_times_hit=trigger_count) if not fault_hit: raise Exception('Fault not hit expected number of times') self.stop_start_validate(cluster_state) def wait_till_all_sqls_done(self, count=1): ''' 500 here is just an arbitrarily long time "if-we-exceed-this-then-oh-crap-lets-error-out" value ''' for i in range(1,500): psql_count = PSQL.run_sql_command("select count(*) from 
pg_stat_activity where current_query <> '<IDLE>'", flags='-q -t', dbname='postgres') if int(psql_count.strip()) <= count : return sleep(1) raise Exception('SQLs expected to complete but are still running') def stop_start_validate(self, cluster_state): ''' Do gpstop immediate, gpstart and see if all segments come back up fine ''' if cluster_state == 'sync' : self.stop_db() self.switch_primary_mirror_role_in_utility_mode() tinctest.logger.info('Successfully switched roles of primary and mirrors in gp_segment_configuration') self.start_db(down_segments=True) rc = self.gprecover.incremental() if not rc: raise Exception('Gprecoverseg failed') if not self.gprecover.wait_till_insync_transition(): raise Exception('Segments not in sync') if cluster_state == 'change_tracking': self.stop_db() self.start_db(down_segments=True) if cluster_state == 'resync': #Resume the filerep_resync filerep_transition_to_sync_begin before stop-start self.fileutil.inject_fault(f='filerep_transition_to_sync_begin', y='resume', r='primary') self.stop_db() self.start_db() if not self.gprecover.wait_till_insync_transition(): raise Exception('Segments not in sync') self.dbstate.check_catalog(alldb=False) def cluster_in_change_tracking(self): ''' Put Cluster into change_tracking ''' self.base.invoke_fault('filerep_consumer', 'fault', role='primary') self.fileutil.wait_till_change_tracking_transition() tinctest.logger.info('Change_tracking transition complete') def validate_system(self, cluster_state): # Validate the system's integrity if (cluster_state == 'change_tracking'): if not self.gprecover.incremental(): raise Exception('Gprecoverseg failed') if not self.gprecover.wait_till_insync_transition(): raise Exception('Segments not in sync') tinctest.logger.info('Segments recovered and back in sync') self.dbstate.check_mirrorintegrity() if self.config.has_master_mirror(): self.dbstate.check_mirrorintegrity(master=True) def run_fault_injector_to_skip_checkpoint(self): tinctest.logger.info('Skip Checkpointing using fault injector.') self.fileutil.inject_fault(y = 'reset', f = 'checkpoint', r ='primary', H='ALL', m ='async', o = '0', p=self.port) (ok, out) = self.fileutil.inject_fault(y = 'skip', f = 'checkpoint', r ='primary', H='ALL', m ='async', o = '0', p=self.port) if not ok: raise Exception('Problem with injecting fault.') def backup_output_dir(self,test_dir, test_id): indir=local_path(test_dir) outdir = indir+'_'+test_id cmdstr="cp -r "+ indir + " " + outdir cmd = Command(name='run cp -r ', cmdStr=cmdstr) tinctest.logger.info("Taking a backup of SQL directory: %s" %cmd) try: cmd.run() except: self.fail("cp -r failed.") tinctest.logger.info("Test SQL directory Backup Done!!") def do_post_run_checks(self): self.stop_start_validate('sync') rc = self.gprecover.incremental() if not rc: raise Exception('Gprecvoerseg failed') self.gprecover.wait_till_insync_transition() tinctest.logger.info("Done going from resync to insync") self.dbstate.check_catalog(alldb=False) self.dbstate.check_mirrorintegrity() if self.config.has_master_mirror(): self.dbstate.check_mirrorintegrity(master=True)
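# wait_till_all_sqls_done above polls pg_stat_activity until only the expected
# number of sessions remain active. A minimal sketch of that wait loop follows
# (illustrative, not part of the original module); PSQL and sleep are assumed
# to be available as in the rest of this file, and the 500-poll cap mirrors
# the arbitrary upper bound used in the original.
def wait_for_idle_sessions(max_active=1, max_polls=500):
    """Block until at most max_active non-idle backends remain, or give up."""
    sql = ("select count(*) from pg_stat_activity "
           "where current_query <> '<IDLE>'")
    for _ in range(max_polls):
        active = PSQL.run_sql_command(sql, flags='-q -t', dbname='postgres')
        if int(active.strip()) <= max_active:
            return True
        sleep(1)
    raise Exception('SQLs expected to complete but are still running')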
def test_gpcheckcat(self):
    tinctest.logger.info('Run Checkcat to verify persistent table consistency')
    dbstate = DbStateClass('run_validation')
    dbstate.check_catalog(alldb=False, dbname=Steps.dbname)
def run_validation(self):
    tinctest.logger.info('Verify the integrity between primary and mirror ...')
    self.dbstate = DbStateClass('run_validation')
    self.dbstate.check_mirrorintegrity()
def test_mpp23395(self): """ @description Test MPP-20964, uncleaned lock table by pg_terminate_backend @product_version gpdb: [4.3.3.1-],[4.2.8.5-4.2.99.99] """ self.util = Filerepe2e_Util() (ok, out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1) if not ok: raise Exception( "Failed to reset the fault dtm_broadcast_commit_prepared") # setup PSQL.run_sql_command(""" DROP TABLE IF EXISTS mpp23395; """) # Scenario 1: FAULT during Create Table on master sql = ''' CREATE TABLE mpp23395(a int); ''' self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1) # Scenario 2: FAULT during Drop Table on master, COMMIT case sql = ''' DROP TABLE mpp23395; ''' self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1) (ok, out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1) if not ok: raise Exception( "Failed to reset the fault dtm_broadcast_commit_prepared") # Scenario 3: FAULT during Create Table on segment, COMMIT case sql = ''' SET debug_dtm_action_target = "protocol"; SET debug_dtm_action_protocol = "commit_prepared"; SET debug_dtm_action_segment = 0; SET debug_dtm_action = "fail_begin_command"; CREATE TABLE mpp23395(a int); ''' self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2) # Scenario 4: FAULT during Drop Table on segment, COMMIT case sql = ''' SET debug_dtm_action_target = "protocol"; SET debug_dtm_action_protocol = "commit_prepared"; SET debug_dtm_action_segment = 0; SET debug_dtm_action = "fail_begin_command"; DROP TABLE mpp23395; ''' self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2) # Scenario 5: FAULT during Create Table on master, ABORT case (ok, out) = self.util.inject_fault( f='transaction_abort_after_distributed_prepared', y='error', seg_id=1) if not ok: raise Exception( "Failed to set the error fault for transaction_abort_after_distributed_prepared" ) sql = ''' CREATE TABLE mpp23395(a int); ''' self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1) (ok, out) = self.util.inject_fault( f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1) if not ok: raise Exception( "Failed to reset the fault transaction_abort_after_distributed_prepared" ) PSQL.run_sql_command(""" CREATE TABLE mpp23395(a int); """) # Scenario 6: FAULT during Drop Table on master, ABORT case (ok, out) = self.util.inject_fault( f='transaction_abort_after_distributed_prepared', y='error', seg_id=1) if not ok: raise Exception( "Failed to set the error fault for transaction_abort_after_distributed_prepared" ) sql = ''' DROP TABLE mpp23395; ''' self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1) (ok, out) = self.util.inject_fault( f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1) if not ok: raise Exception( "Failed to reset the fault transaction_abort_after_distributed_prepared" ) PSQL.run_sql_command(""" DROP TABLE mpp23395; """) dbstate = DbStateClass('run_validation') dbstate.check_catalog()
class AOCOAlterColumn(MPPTestCase): def __init__(self): self.fileutil = Filerepe2e_Util() self.gprecover = GpRecover() self.config = GpConfig() self.base_dir = os.path.dirname( sys.modules[self.__class__.__module__].__file__) def get_sql_files(self, sql_file_name): sql_file = os.path.join(self.base_dir, "sql", sql_file_name + ".sql") return sql_file def validate_sql(self, ans_file, out_file): ''' Compare the out and ans files ''' init_file = os.path.join(self.base_dir, "sql", 'init_file') result1 = Gpdiff.are_files_equal(out_file, ans_file, match_sub=[init_file]) self.assertTrue(result1, 'Gpdiff.are_files_equal') def run_sql(self, filename, out_file, background=False): ''' Run the provided sql and validate it ''' out_file = local_path(filename.replace(".sql", ".out")) PSQL.run_sql_file(filename, out_file=out_file, background=background) def run_test_CatalogCheck(self, action, storage): file_name = action + '_' + storage sql_file = self.get_sql_files(file_name) out_file = self.base_dir + "/sql/" + file_name + '.out' tinctest.logger.info('sql-file == %s \n' % sql_file) tinctest.logger.info('out-file == %s \n' % out_file) # Run Add/Drop Column script self.run_sql(sql_file, out_file=out_file) def validate_test_CatalogCheck(self, action, storage): file_name = action + '_' + storage out_file = self.base_dir + "/sql/" + file_name + '.out' ans_file = self.base_dir + "/expected/" + file_name + '.ans' tinctest.logger.info('out-file == %s \n' % out_file) tinctest.logger.info('ans-file == %s \n' % ans_file) # Validate Ans file self.validate_sql(ans_file, out_file) if storage == 'multisegfiles': ''' check if multi_segfile_tab file has multiple segfiles per column ''' tablename = 'multi_segfile_tab' relid = self.get_relid(file_name=tablename) utilitymodeinfo = self.get_utilitymode_conn_info(relid=relid) u_port = utilitymodeinfo[0] u_host = utilitymodeinfo[1] assert (1 < int( self.get_segment_cnt(relid=relid, host=u_host, port=u_port))) # Check Correctness of the catalog self.dbstate = DbStateClass('run_validation') outfile = local_path("gpcheckcat_" + datetime.datetime.fromtimestamp( time.time()).strftime('%Y%m%d%H%M%S') + ".out") self.dbstate.check_catalog(outputFile=outfile) def get_dbid(self): sql_cmd = "select min(dbid) dbid from gp_segment_configuration where role = 'p' and status = 'u' and content > -1" dbid = PSQL.run_sql_command(sql_cmd=sql_cmd, flags='-q -t') tinctest.logger.info('Segments %s chosen for fault injection' % (dbid)) return dbid def log_segment_state(self): sql_cmd = "select * from gp_segment_configuration order by dbid" result = PSQL.run_sql_command(sql_cmd=sql_cmd) tinctest.logger.info('==========================') tinctest.logger.info('State of Segments ') tinctest.logger.info(result) tinctest.logger.info('==========================') def get_segcount_state(self, state): sql_cmd = "select count(*) from gp_segment_configuration where status = '%s'" % ( state) result = PSQL.run_sql_command(sql_cmd=sql_cmd, flags='-q -t') tinctest.logger.info('Number of segments in %s State == %d' % (state, (int(result)))) return int(result) def get_utilitymode_conn_info(self, relid=0): #get the segment_id where to log in utility mode and then get the hostname and port for this segment sql_cmd = "select port, hostname from gp_segment_configuration sc where dbid > 1 and role = 'p' limit 1;" utilitymodeinfo = PSQL.run_sql_command(sql_cmd=sql_cmd, flags='-q -t') u_port = utilitymodeinfo.strip().split('|')[0] u_host = utilitymodeinfo.strip().split('|')[1] return [u_port, u_host] def get_relid(self, 
file_name=None): sql_cmd = "SELECT oid FROM pg_class WHERE relname='%s';\n" % file_name relid = PSQL.run_sql_command(sql_cmd=sql_cmd, flags='-q -t') return relid def get_segment_cnt(self, relid=0, host=None, port=None): sql_cmd = "select count(*) from gp_toolkit.__gp_aocsseg(%s) group by column_num having count(*) > 1 limit 1" % ( relid) segcnt = PSQL.run_sql_command_utility_mode(sql_cmd=sql_cmd, host=host, port=port, flags='-q -t') if (len(segcnt.strip()) == 0): segcnt = '0' return segcnt def run_test_utility_mode(self, filename): #alter_aoco_tab_utilitymode relid = self.get_relid(file_name=filename) utilitymodeinfo = self.get_utilitymode_conn_info(relid=relid) u_port = utilitymodeinfo[0] u_host = utilitymodeinfo[1] self.run_sql_utility_mode(filename, host=u_host, port=u_port) def run_sql_utility_mode(self, filename, host=None, port=None): fname = filename sql_file = self.get_sql_files(fname) out_file = self.base_dir + "/sql/" + fname + '.out' ans_file = self.base_dir + "/expected/" + fname + '.ans' tinctest.logger.info('\n==============================') tinctest.logger.info(sql_file) tinctest.logger.info(out_file) tinctest.logger.info(ans_file) tinctest.logger.info('==============================') result = PSQL.run_sql_file_utility_mode(sql_file, out_file=out_file, host=host, port=port) self.validate_sql(ans_file, out_file)