Example #1
0
 def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
     """Run the catalog check, then log the segment configuration.

     The keyword parameters are accepted but not read by this body; the
     catalog check is invoked without arguments.  Always returns True.
     """
     checker = DbStateClass('run_validation')
     self.dbstate = checker
     tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
     checker.check_catalog()
     # Log the full segment configuration once the check has finished.
     segment_config_sql = "select * from gp_segment_configuration order by dbid"
     tinctest.logger.info(PSQL.run_sql_command(segment_config_sql))
     return True
Example #2
0
 def __init__(self, methodName):
     """Create the helper objects, then initialise the parent test case.

     @param methodName: forwarded unchanged to the parent constructor.
     """
     self.pgport = os.getenv('PGPORT')
     self.util = Filerepe2e_Util()
     self.gpconfig = GpConfig()
     # One GPDBConfig instance is shared by the recovery and validation helpers.
     cluster_config = GPDBConfig()
     self.config = cluster_config
     self.gpr = GpRecover(cluster_config)
     self.dbstate = DbStateClass('run_validation', cluster_config)
     self.gpstart, self.gpstop = GpStart(), GpStop()
     super(FilerepTestCase, self).__init__(methodName)
Example #3
0
 def do_gpcheckcat(self,
                   dbname=None,
                   alldb=False,
                   online=False,
                   outputFile='checkcat.out',
                   outdir=None):
     """Run the catalog check without arguments and return True.

     The keyword parameters are accepted but not read by this body.
     """
     checker = DbStateClass('run_validation')
     self.dbstate = checker
     tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
     checker.check_catalog()
     return True
Example #4
0
 def __init__(self, methodName):
     """Create the helper objects, then initialise the parent test case.

     @param methodName: forwarded unchanged to the parent constructor.
     """
     self.fileutil = Filerepe2e_Util()
     # The same GPDBConfig instance backs every helper that takes a config.
     shared_config = GPDBConfig()
     self.config = shared_config
     self.gprecover = GpRecover(shared_config)
     self.gpstart, self.gpstop = GpStart(), GpStop()
     self.gpfile = Gpfilespace(shared_config)
     self.dbstate = DbStateClass('run_validation', shared_config)
     self.port = os.environ.get('PGPORT')
     self.base = GPDBStorageBaseTestCase()
     super(SuspendCheckpointCrashRecovery, self).__init__(methodName)
Example #5
0
    def _validation(self):
        '''
        @summary : Run the catalog check (gpcheckcat), then the mirror
                   integrity check (gpcheckmirrorintegrity)
        '''
        # NOTE: an earlier variant issued several CHECKPOINTs and slept for
        # 30 seconds before validating; that step is not executed here.
        checker = DbStateClass('run_validation')
        self.dbstate = checker
        tinctest.logger.info("[STLRTest] Running _validation")

        # Catalog check output is written to a fixed file name resolved
        # through local_path.
        catalog_report = local_path("subt_checkcat.out")
        checker.check_catalog(outputFile=catalog_report)

        checker.check_mirrorintegrity()
Example #6
0
 def __init__(self, methodName):
     """Create the helper objects, then initialise the parent test case.

     @param methodName: forwarded unchanged to the parent constructor.
     """
     self.pgport = os.getenv('PGPORT')
     self.util = Filerepe2e_Util()
     self.gpconfig = GpConfig()
     # One GPDBConfig instance is shared by the recovery and validation helpers.
     cluster_config = GPDBConfig()
     self.config = cluster_config
     self.gpr = GpRecover(cluster_config)
     self.dbstate = DbStateClass('run_validation', cluster_config)
     self.gpstart, self.gpstop = GpStart(), GpStop()
     super(FilerepTestCase, self).__init__(methodName)
Example #7
0
 def __init__(self, methodName):
     """Create the helper objects, then initialise the parent test case.

     @param methodName: forwarded unchanged to the parent constructor.
     """
     self.filereputil = Filerepe2e_Util()
     # The same GPDBConfig instance backs every helper that takes a config.
     shared_config = GPDBConfig()
     self.config = shared_config
     self.gprecover = GpRecover(shared_config)
     self.gpstop, self.gpstart = GpStop(), GpStart()
     self.gpverify = GpdbVerify(config=shared_config)
     self.dbstate = DbStateClass('run_validation', shared_config)
     self.port = os.environ.get('PGPORT')
     super(PgtwoPhaseClass, self).__init__(methodName)
Example #8
0
 def validate_test_CatalogCheck(self, action,storage):
     """Compare the sql output with its answer file, then run the catalog check.

     The output file is <base_dir>/sql/<action>_<storage>.out and the answer
     file is <base_dir>/expected/<action>_<storage>.ans.  For the
     'multisegfiles' storage the segment count reported for
     multi_segfile_tab must additionally be greater than one.
     """
     case_name = action + '_' + storage
     out_file = self.base_dir + "/sql/" + case_name + '.out'
     ans_file = self.base_dir + "/expected/" + case_name + '.ans'
     tinctest.logger.info('out-file == %s \n' % out_file)
     tinctest.logger.info('ans-file == %s \n' % ans_file)
     # Validate Ans file
     self.validate_sql(ans_file, out_file)
     if storage == 'multisegfiles':
         # multi_segfile_tab is expected to have multiple segfiles per column
         relid = self.get_relid(file_name='multi_segfile_tab')
         conn_info = self.get_utilitymode_conn_info(relid=relid)
         u_port = conn_info[0]
         u_host = conn_info[1]
         segfile_count = int(
             self.get_segment_cnt(relid=relid, host=u_host, port=u_port))
         assert(1 < segfile_count)
     # Check Correctness of the catalog
     self.dbstate = DbStateClass('run_validation')
     # The report name carries a second-resolution timestamp.
     stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')
     report_path = local_path("gpcheckcat_" + stamp + ".out")
     self.dbstate.check_catalog(outputFile=report_path)
Example #9
0
    def __init__(self, config=None):
        """Set up the cluster helpers, reusing `config` when one is supplied.

        @param config: configuration object shared by the helpers; a new
                       GPDBConfig is created when it is None.
        """
        self.config = GPDBConfig() if config is None else config

        self.filereputil = Filerepe2e_Util()
        self.gprecover = GpRecover(self.config)
        self.gpstop, self.gpstart = GpStop(), GpStart()
        self.gpverify = GpdbVerify(config=self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.port = os.environ.get('PGPORT')
Example #10
0
    def _validation(self):
        '''
        @summary : Run the catalog check (gpcheckcat), then the mirror
                   integrity check (gpcheckmirrorintegrity)
        '''
        # NOTE: an earlier variant issued several CHECKPOINTs and slept for
        # 30 seconds before validating; that step is not executed here.
        checker = DbStateClass('run_validation')
        self.dbstate = checker
        tinctest.logger.info("[STLRTest] Running _validation")

        # Catalog check output is written to a fixed file name resolved
        # through local_path.
        catalog_report = local_path("subt_checkcat.out")
        checker.check_catalog(outputFile=catalog_report)

        checker.check_mirrorintegrity()
Example #11
0
 def validate_test_CatalogCheck(self, action,storage):
     """Compare the sql output with its answer file, then run the catalog check.

     The output file is <base_dir>/sql/<action>_<storage>.out and the answer
     file is <base_dir>/expected/<action>_<storage>.ans.  For the
     'multisegfiles' storage the segment count reported for
     multi_segfile_tab must additionally be greater than one.
     """
     case_name = action + '_' + storage
     out_file = self.base_dir + "/sql/" + case_name + '.out'
     ans_file = self.base_dir + "/expected/" + case_name + '.ans'
     tinctest.logger.info('out-file == %s \n' % out_file)
     tinctest.logger.info('ans-file == %s \n' % ans_file)
     # Validate Ans file
     self.validate_sql(ans_file, out_file)
     if storage == 'multisegfiles':
         # multi_segfile_tab is expected to have multiple segfiles per column
         relid = self.get_relid(file_name='multi_segfile_tab')
         conn_info = self.get_utilitymode_conn_info(relid=relid)
         u_port = conn_info[0]
         u_host = conn_info[1]
         segfile_count = int(
             self.get_segment_cnt(relid=relid, host=u_host, port=u_port))
         assert(1 < segfile_count)
     # Check Correctness of the catalog
     self.dbstate = DbStateClass('run_validation')
     # The report name carries a second-resolution timestamp.
     stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')
     report_path = local_path("gpcheckcat_" + stamp + ".out")
     self.dbstate.check_catalog(outputFile=report_path)
Example #12
0
 def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
     """Run the catalog check without arguments and return True.

     The keyword parameters are accepted but not read by this body.
     """
     checker = DbStateClass('run_validation')
     self.dbstate = checker
     tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
     checker.check_catalog()
     return True
Example #13
0
 def run_validation(self):
     """Log the start of the check, then run the mirror integrity validation."""
     tinctest.logger.info('Veriy the integrity between primary and mirror ...')
     integrity_checker = DbStateClass('run_validation')
     self.dbstate = integrity_checker
     integrity_checker.check_mirrorintegrity()
Example #14
0
class SubTransactionLimitRemovalTestCase(MPPTestCase):
    def __init__(self, methodName):
        # Adds no state of its own: methodName is handed straight to the
        # parent constructor.
        super(SubTransactionLimitRemovalTestCase, self).__init__(methodName)

    def check_system(self):
        '''
        @summary: Check whether the system is up and in sync.

        Counts the segments whose content is not -1, counts those that also
        have mode 's' and status 'u', logs the full segment configuration,
        and raises Exception when the two counts differ.
        '''
        tinctest.logger.info("[STLRTest] Running check_system")
        tinctest.logger.info(
            "[STLRTest] Check whether the system is up and sync")

        def _count(sql):
            # The count is taken from the fourth line of the psql output.
            return PSQL.run_sql_command(sql).split('\n')[3].strip()

        total_segments = _count(
            "select count(*) from gp_segment_configuration where content<> -1 ;")
        healthy_segments = _count(
            "select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';")

        tinctest.logger.info("[STLRTest] printing gp segment configuration")
        tinctest.logger.info(PSQL.run_sql_command(
            "select * from gp_segment_configuration order by dbid"))

        if total_segments == healthy_segments:
            tinctest.logger.info(
                "[STLRTest] Starting New Test: System is up and in sync...")
            return
        raise Exception(
            "[STLRTest] System not in sync and up. Exiting test")

    def run_sqls(self, test):
        '''
        @summary : Run one sql file through PSQL
        @param test: sql file name; it is resolved with local_path before
                     being run
        '''
        tinctest.logger.info("[STLRTest] Running run_sqls")
        start_msg = "[STLRTest]Starting new thread to run sql %s" % (test)
        tinctest.logger.info(start_msg)
        sql_path = local_path(test)
        PSQL.run_sql_file(sql_path)

    def suspend_faults(self, fault_name):
        '''
        @summary : Suspend the specified fault: reset it before issuing suspend
        @param fault_name : Name of the fault to suspend
        @raise Exception : when either injector call reports failure
        '''
        tinctest.logger.info("[STLRTest] Running suspend_faults")

        self.util = Filerepe2e_Util()

        # Each step is a (fault type, completion message) pair, run in order.
        steps = (
            ('reset', "[STLRTest]Done resetting the %s fault"),
            ('suspend', "[STLRTest]Done suspending the %s fault"),
        )
        for fault_type, done_msg in steps:
            ok, _ = self.util.inject_fault(f=fault_name,
                                           m='async',
                                           y=fault_type,
                                           r='primary',
                                           H='ALL')
            if not ok:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg % (fault_name))

    def check_fault_status(self, fault_name=None, status=None, max_cycle=10):
        '''
        Check whether a fault has reached a status. Poll till it has.
        @param fault_name : Fault name
        @param status : Status to be checked - triggered/completed
        @param max_cycle : Maximum number of status polls; each unsuccessful
                           poll is followed by a 10 second sleep
        @return : True as soon as one line of the injector output contains
                  both fault_name and status, False when max_cycle polls
                  pass without such a line
        @raise Exception : when fault_name or status is empty
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running check_fault_status %s",
                             status)

        if (not fault_name) or (not status):
            raise Exception(
                "[STLRTest]Need a value for fault_name and status to continue")

        poll = 0
        while poll < max_cycle:
            # Only the text output is inspected; the ok flag is not used.
            (_, out1) = self.util.inject_fault(f=fault_name,
                                               m='async',
                                               y='status',
                                               r='primary',
                                               H='ALL')
            poll += 1
            for line in out1.splitlines():
                # Plain substring test: the previous `find(...) > 0` form
                # silently ignored a match starting at column 0.
                if fault_name in line and status in line:
                    tinctest.logger.info("[STLRTest]Fault %s is %s " %
                                         (fault_name, status))
                    tinctest.logger.info(
                        "[STLRTest] Running check_fault_status %s TRUE",
                        status)
                    return True

            #sleep a while before start polling again
            sleep(10)
        tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE",
                             status)
        return False

    def filerep_fault(self, trans_state):
        '''
        @summary : Inject the fault that corresponds to the given transition
        @param trans_state : type of transition; 'failover_to_primary',
                             'failover_to_mirror' and 'postmaster_reset'
                             each inject one fault, any other value injects
                             nothing
        @raise Exception : when the injector call reports failure
        '''
        tinctest.logger.info("[STLRTest] Running filerep_fault")
        self.util = Filerepe2e_Util()

        # (transition, start message, fault name, fault type, role,
        #  completion message)
        fault_plan = (
            ('failover_to_primary', "[STLRTest] primary failover",
             'filerep_consumer', 'fault', 'mirror',
             "[STLRTest]Done primary failover fault"),
            ('failover_to_mirror', "[STLRTest] fault for postmaster panic",
             'postmaster', 'panic', 'primary',
             "[STLRTest]Done postmaster panic fault"),
            ('postmaster_reset', "[STLRTest] fault for filerep_sender panic",
             'filerep_sender', 'panic', 'primary',
             "[STLRTest]Done filerep_sender panic fault"),
        )
        for state, start_msg, fault, fault_type, role, done_msg in fault_plan:
            if trans_state != state:
                continue
            tinctest.logger.info(start_msg)
            ok, _ = self.util.inject_fault(f=fault,
                                           m='async',
                                           y=fault_type,
                                           r=role,
                                           H='ALL')
            if not ok:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg)
            break

        tinctest.logger.info("[STLRTest] Done Injecting Fault")

    def resume_faults(self, fault_name, trans_state):
        '''
        @summary : Resume the fault and check status
        @param fault_name : Name of the fault to resume
        @param trans_state : type of transition; selects which roles are
                             resumed and whether the status is polled
        @raise Exception : when the resume on primary reports failure
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_faults")

        # Every transition except failover_to_mirror resumes on primary.
        if trans_state != 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for %s resume" % fault_name)
            resumed, _ = self.util.inject_fault(f=fault_name,
                                                m='async',
                                                y='resume',
                                                r='primary',
                                                H='ALL')
            if not resumed:
                raise Exception("[STLRTest]Fault resume failed")
            tinctest.logger.info("[STLRTest]Done fault for %s resume" %
                                 fault_name)

        if trans_state == 'postmaster_reset':
            resumed, _ = self.util.inject_fault(f=fault_name,
                                                m='async',
                                                y='resume',
                                                r='mirror',
                                                H='ALL')
            if not resumed:
                # A failed resume on mirror is logged, not raised.
                tinctest.logger.info(
                    "[STLRTest]Failed fault for %s resume on mirror" %
                    fault_name)
        elif trans_state == 'failover_to_primary':
            # The polling result is not inspected here.
            self.check_fault_status(fault_name, 'completed')

    def checkPSQLRun(self, test):
        '''Wait until the psql run started in parallel is over, then return True.

        Every 10 seconds the process listing is filtered for
        sub_transaction_limit_removal psql entries; the method returns once
        no element of that output contains the text of `test`.
        '''
        tinctest.logger.info("[STLRTest] Running checkPSQLRun")
        cmd_str = 'ps -ef|grep sub_transaction_limit_removal|grep psql'
        while True:
            (rc, out) = shell.run(cmd_str)
            # NOTE(review): `out` is iterated element by element; this only
            # matches whole lines if shell.run returns a list of lines --
            # confirm against shell.run.
            if not any(('%s' % test) in line for line in out):
                return True
            sleep(10)

    def resume_filerep_resync(self):
        '''
        @summary : Resume the filerep_resync fault on primary, then sleep
                   for 10 seconds
        @raise Exception : when the injector call reports failure
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_filerep_resync")
        tinctest.logger.info("[STLRTest] fault for failover_to_mirror resume")

        resume_args = dict(f='filerep_resync',
                           m='async',
                           y='resume',
                           r='primary',
                           H='ALL')
        resumed, _ = self.util.inject_fault(**resume_args)
        if not resumed:
            raise Exception("[STLRTest]Fault injection failed")

        tinctest.logger.info(
            "[STLRTest]Done fault for failover_to_mirror resume")
        sleep(10)

    def stop_start_validate(self, expect_down_segments=False):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine

        @param expect_down_segments: when true, the result of the stop
                                     command is not checked
        @return: the tuple (True, "All segments are up")
        @raise Exception: when the checked stop fails, when the start fails,
                          or when self.anydownsegments() returns a false
                          value
        """
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running stop_start_validate")

        tinctest.logger.info("[STLRTest]Shutting down the cluster")
        stopped = self.gpstop.run_gpstop_cmd(immediate='i')
        if not (expect_down_segments or stopped):
            raise Exception(
                '[STLRTest]Problem while shutting down the cluster')
        if not expect_down_segments:
            tinctest.logger.info(
                "[STLRTest]Successfully shutdown the cluster.")

        tinctest.logger.info("[STLRTest]Restarting the cluster.")
        if not self.gpstart.run_gpstart_cmd():
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        tinctest.logger.info("[STLRTest]Successfully restarted the cluster.")

        if self.anydownsegments():
            return (True, "All segments are up")
        raise Exception("[STLRTest]segments were marked down")

    def run_gprecoverseg(self, recover_option):
        '''
        @summary : Call gprecoverseg full or incremental to bring back the cluster to sync
        @param recover_option : 'full' selects the full recovery; any other
                                value selects the incremental one
        '''
        self.gpr = GpRecover()

        tinctest.logger.info("[STLRTest] Running run_gprecoverseg")

        recover = (self.gpr.full
                   if recover_option == 'full' else self.gpr.incremental)
        recover()

        # Wait for the in-sync transition before returning.
        self.gpr.wait_till_insync_transition()

    def run_restart_database(self):
        '''
        @summary : Restart the database (immediate stop followed by start)
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running run_restart_database")
        # The outcome of each command is logged, not checked.
        tinctest.logger.info(self.gpstop.run_gpstop_cmd(immediate='i'))
        tinctest.logger.info(self.gpstart.run_gpstart_cmd())

    def reset_faults(self, fault_name, current_cluster_state):
        '''
        @summary : Reset the faults at the end of test
        @param fault_name : Name of the fault to reset first
        @param current_cluster_state : 'resync' additionally resets the
                                       filerep_resync fault
        @raise Exception : when any reset reports failure
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running reset_faults")

        tinctest.logger.info("[STLRTest] Resetting fault before ending test")

        def _reset_on_primary(name):
            # Reset one fault on primary; raise when the injector fails.
            ok, _ = self.util.inject_fault(f=name,
                                           m='async',
                                           y='reset',
                                           r='primary',
                                           H='ALL')
            if not ok:
                raise Exception("[STLRTest]Fault injection failed")

        _reset_on_primary(fault_name)
        tinctest.logger.info("[STLRTest]Done resetting %s fault" %
                             (fault_name))

        if current_cluster_state == 'resync':
            _reset_on_primary('filerep_resync')
            tinctest.logger.info("[STLRTest]Done filerep_resync fault")

        # The checkpoint fault is reset regardless of the cluster state.
        _reset_on_primary('checkpoint')
        tinctest.logger.info("[STLRTest]Done resetting checkpoint fault")

    def do_gpcheckcat(self,
                      dbname=None,
                      alldb=False,
                      online=False,
                      outputFile='checkcat.out',
                      outdir=None):
        '''
        @summary : Run the catalog check without arguments and return True.
                   The keyword parameters are accepted but not read by this
                   body.
        '''
        checker = DbStateClass('run_validation')
        self.dbstate = checker
        tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
        checker.check_catalog()
        return True

    def _validation(self):
        '''
        @summary : Run the catalog check (gpcheckcat), then the mirror
                   integrity check (gpcheckmirrorintegrity)
        '''
        # NOTE: an earlier variant issued several CHECKPOINTs and slept for
        # 30 seconds before validating; that step is not executed here.
        checker = DbStateClass('run_validation')
        self.dbstate = checker
        tinctest.logger.info("[STLRTest] Running _validation")

        # Catalog check output is written to a fixed file name resolved
        # through local_path.
        catalog_report = local_path("subt_checkcat.out")
        checker.check_catalog(outputFile=catalog_report)

        checker.check_mirrorintegrity()

    def inject_and_resume_fault(self, fault_name, trans_state):
        '''
        @summary : Poll for the fault to be triggered, inject the transition
                   fault, then resume the original fault
        @param fault_name : Name of the fault to poll and resume
        @param trans_state : type of transition; 'failover_to_mirror' also
                             runs test_while_ct.sql before resuming
        '''
        # The polling result is not inspected; execution continues either way.
        self.check_fault_status(fault_name, 'triggered')
        self.filerep_fault(trans_state)
        run_ct_sql = trans_state == 'failover_to_mirror'
        if run_ct_sql:
            ct_sql_path = local_path('test_while_ct.sql')
            PSQL.run_sql_file(ct_sql_path)
        self.resume_faults(fault_name, trans_state)

    def run_post_sqls(self, fault_name='', trans_state=''):
        '''
        @summary : Run the post sql for the transition and diff its output
                   against the answer file
        @param fault_name : only used in the failure message
        @param trans_state : 'failover_to_primary' or '' selects the
                             post_commit files, anything else post_abort
        '''
        PSQL.wait_for_database_up()
        post_sql = ("failover_sql/subt_create_table_ao_post_commit"
                    if trans_state in ('failover_to_primary', '') else
                    "failover_sql/subt_create_table_ao_post_abort")

        # The sql, answer and output files share one base name.
        sql_path = local_path(post_sql + ".sql")
        out_path = local_path(post_sql + ".out")
        PSQL.run_sql_file(sql_file=sql_path, out_file=out_path)

        ans_path = local_path(post_sql + ".ans")
        if Gpdiff.are_files_equal(out_path, ans_path):
            return
        self.fail("[STLRTest]Gpdiff failed for : %s %s" %
                  (fault_name, trans_state))

    def reset_all_faults(self):
        '''
        @summary : Reset all faults on primary and mirror
        @raise Exception : when either reset reports failure
        '''
        tinctest.logger.info("[STLRTest] Running reset_all_faults")
        self.util = Filerepe2e_Util()

        # (role, completion message); primary is reset before mirror.
        targets = (
            ('primary', "[STLRTest]Done resetting all faults on primary"),
            ('mirror', "[STLRTest]Done resetting all faults fault on mirror"),
        )
        for role, done_msg in targets:
            ok, _ = self.util.inject_fault(f='all',
                                           m='async',
                                           y='reset',
                                           r=role,
                                           H='ALL')
            if not ok:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg)

    def kill_zombies(self):
        '''
        @summary : There are stray zombie processes running after each test. This method clears them
        '''
        tinctest.logger.info("[STLRTest] Running kill_zombies")
        # awk prints "<column 3>#<column 2>" for every ps line mentioning
        # "port"; the final grep keeps lines containing the word 1.
        cmd_str = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1"
        finder = Command("shell_command", cmd_str)
        tinctest.logger.info('Executing command: %s : %s' %
                             ("shell command", cmd_str))
        finder.run()
        for entry in finder.get_results().stdout.split('\n'):
            fields = entry.split('#')
            # Only entries whose first field is exactly 1 are killed.
            if fields[0] != '1':
                continue
            Command("kill_command", "kill -9 %s" % (fields[1])).run()

    def skip_checkpoint(self):
        '''
        @summary : Routine to inject fault that skips checkpointing; the
                   checkpoint fault is reset before the skip is issued
        @raise Exception : when either injector call reports failure
        '''

        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running skip_checkpoint")

        # Each step is a (fault type, completion message) pair, run in order.
        steps = (
            ('reset', "[STLRTest]Done resetting the checkpoint fault"),
            ('skip', "[STLRTest]Done skipping the checkpoint fault"),
        )
        for fault_type, done_msg in steps:
            ok, _ = self.util.inject_fault(f='checkpoint',
                                           m='async',
                                           y=fault_type,
                                           r='primary',
                                           H='ALL')
            if not ok:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info(done_msg)

    def method_setup(self):
        '''
        @summary : Create the filespace subt_filespace_a
        '''
        tinctest.logger.info("Performing setup tasks")
        Gpfilespace().create_filespace('subt_filespace_a')

    def cleandb(self):
        '''
        @summary : Call setupDatabase for the gptest database
        '''
        Database().setupDatabase('gptest')
Example #15
0
File: verify.py Project: 50wu/gpdb
 def test_gpcheckcat(self):
     # Run the catalog check with alldb disabled against Verification.dbname.
     catalog_checker = DbStateClass('run_validation')
     catalog_checker.check_catalog(alldb=False, dbname=Verification.dbname)
 def test_verify_catalog(self):
     # Run the catalog check with alldb disabled against
     # catalog_consistency.dbname.
     catalog_checker = DbStateClass('run_validation')
     catalog_checker.check_catalog(alldb=False,
                                   dbname=catalog_consistency.dbname)
Example #17
0
    def test_mpp23395(self):
        """
        
        @description Test MPP-20964, uncleaned lock table by pg_terminate_backend
        @product_version gpdb: [4.3.3.1-],[4.2.8.5-4.2.99.99]
        """
        # Overview: each scenario hands a SQL script plus a fault name and
        # fault type to self.run_sequence (defined outside this method).
        # NOTE(review): the positional arguments look like
        # (sql, fault name, fault type, seg id[, flag]) -- confirm against
        # run_sequence.
        self.util = Filerepe2e_Util()

        # Clear any leftover dtm_broadcast_commit_prepared fault first.
        (ok,out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1);
        if not ok:
           raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared")
 
        # setup
        PSQL.run_sql_command("""
          DROP TABLE IF EXISTS mpp23395;
          """)

        # Scenario 1: FAULT during Create Table on master
        sql = '''
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1);

        # Scenario 2: FAULT during Drop Table on master, COMMIT case
        sql = '''
        DROP TABLE mpp23395;
        '''
        self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1);

        # Reset the fault used by scenarios 1 and 2 before moving on.
        (ok,out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1);
        if not ok:
           raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared")

        # Scenario 3: FAULT during Create Table on segment, COMMIT case
        sql = '''
        SET dtx_phase2_retry_count = 1;
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2);

        # Scenario 4: FAULT during Drop Table on segment, COMMIT case
        sql = '''
        SET dtx_phase2_retry_count = 1;
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";
        DROP TABLE mpp23395;
        '''
        self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2);

        # Scenario 5: FAULT during Create Table on master, ABORT case
        # The 'error' fault stays armed while run_sequence executes and is
        # reset right after it.
        (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1);
        if not ok:
           raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared")
 
        sql = '''
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1);

        (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1);
        if not ok:
           raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared")


        # Create the table without any fault armed so scenario 6 has
        # something to drop.
        PSQL.run_sql_command("""
        CREATE TABLE mpp23395(a int);
          """)

        # Scenario 6: FAULT during Drop Table on master, ABORT case
        (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1);
        if not ok:
           raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared")
 
        sql = '''
        DROP TABLE mpp23395;
        '''
        self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1);

        (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1);
        if not ok:
           raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared")

        # Drop the table with the fault reset.
        PSQL.run_sql_command("""
        DROP TABLE mpp23395;
          """)


        # Scenario 7: FAULT during Create Table on segment, COMMIT case, succeeds on second retry
        # Unlike scenarios 3 and 4, this script does not set
        # dtx_phase2_retry_count, and run_sequence gets an extra False.
        sql = '''
        DROP TABLE IF EXISTS mpp23395;
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2, False);

        # QE panics after writing prepare xlog record.  This should cause
        # master to broadcast abort but QEs handle the abort in
        # DTX_CONTEXT_LOCAL_ONLY context.
        sql = '''
        DROP TABLE IF EXISTS mpp23395;
        CREATE TABLE mpp23395(a int);
        INSERT INTO mpp23395 VALUES(1), (2), (3);
        BEGIN;
        SET debug_abort_after_segment_prepared = true;
        DELETE FROM mpp23395;
        COMMIT;
        '''

        # No prepared transactions should remain lingering
        PSQL.run_sql_command(sql)
        self.check_no_dangling_prepared_transaction()

        # Finish with a catalog check.
        dbstate = DbStateClass('run_validation')
        dbstate.check_catalog()
Example #18
0
class FilerepTestCase(MPPTestCase):
    def __init__(self, methodName):
        """
        Create the helper objects used by the filerep test steps.

        All helpers are attached to the instance before the base-class
        constructor is invoked.  The recovery and validation helpers are
        built from the same GPDBConfig object that is stored in self.config.
        """
        self.pgport = os.getenv('PGPORT')
        self.util = Filerepe2e_Util()
        self.gpconfig = GpConfig()

        cluster_config = GPDBConfig()
        self.config = cluster_config
        self.gpr = GpRecover(cluster_config)
        self.dbstate = DbStateClass('run_validation', cluster_config)

        self.gpstart = GpStart()
        self.gpstop = GpStop()
        super(FilerepTestCase, self).__init__(methodName)

    def sleep(self, seconds=60):
        """
        Block the caller for the given number of seconds (60 by default).
        """
        delay = seconds
        time.sleep(delay)

    def create_file_in_datadir(self, content, role, filename):
        """
        Touch ``filename`` inside the data directory of one segment.

        The segment is looked up through self.config by its content id and
        role; the ``touch`` command is issued against that segment's host and
        run with validateAfter=True.
        """
        segment_dbid = self.config.get_dbid(content=content, seg_role=role)
        seg_host, seg_datadir = self.config.get_host_and_datadir_of_segment(
            dbid=segment_dbid)
        target = os.path.join(seg_datadir, filename)
        touch_cmd = Command('create a file', 'touch %s' % target,
                            ctxt=REMOTE, remoteHost=seg_host)
        touch_cmd.run(validateAfter=True)

    def remove_file_in_datadir(self, content, role, filename):
        """
        Remove ``filename`` from the data directory of one segment.

        The segment is looked up through self.config by its content id and
        role; the ``rm`` command is issued against that segment's host and
        run with validateAfter=True.
        """
        segment_dbid = self.config.get_dbid(content=content, seg_role=role)
        seg_host, seg_datadir = self.config.get_host_and_datadir_of_segment(
            dbid=segment_dbid)
        target = os.path.join(seg_datadir, filename)
        rm_cmd = Command('remove a file', 'rm %s' % target,
                         ctxt=REMOTE, remoteHost=seg_host)
        rm_cmd.run(validateAfter=True)

    def get_timestamp_of_file_in_datadir(self, content, role, filename):
        """
        Return the modification time of ``filename`` in a segment's data
        directory.

        A small ``python -c`` one-liner printing ``os.stat(...).st_mtime`` is
        run against the segment's host; the stripped stdout of that command
        is returned as a string.
        """
        segment_dbid = self.config.get_dbid(content=content, seg_role=role)
        seg_host, seg_datadir = self.config.get_host_and_datadir_of_segment(
            dbid=segment_dbid)
        target = os.path.join(seg_datadir, filename)
        stat_snippet = (
            """ python -c "import os; print os.stat('%s').st_mtime" """ %
            target)
        stat_cmd = Command('check timestamp', stat_snippet,
                           ctxt=REMOTE, remoteHost=seg_host)
        stat_cmd.run(validateAfter=True)
        return stat_cmd.get_results().stdout.strip()

    def verify_file_exists(self, content, role, filename):
        """
        Check that ``filename`` is a regular file in a segment's data
        directory.

        Runs ``test -f`` against the segment's host with validateAfter=True;
        nothing is returned.
        """
        segment_dbid = self.config.get_dbid(content=content, seg_role=role)
        seg_host, seg_datadir = self.config.get_host_and_datadir_of_segment(
            dbid=segment_dbid)
        target = os.path.join(seg_datadir, filename)
        exists_cmd = Command('check if file exists', 'test -f %s' % target,
                             ctxt=REMOTE, remoteHost=seg_host)
        exists_cmd.run(validateAfter=True)

    def handle_ext_cases(self, file):
        """
        @file: wet sql file to replace with specific machine env.
        """

        host = str(socket.gethostbyname(socket.gethostname()))  #Must be an IP
        querystring = "gpfdist://" + host + ":8088"

        if os.path.isfile(file):
            for line in fileinput.FileInput(file, inplace=1):
                line = re.sub('gpfdist.+8088', querystring, line)
                print str(re.sub('\n', '', line))

    def handle_hybrid_part_cases(self, file):
        """
        @file: hybrid sql file to replace with specific machine env
        """

        querystring = "FROM '" + local_path('hybrid_part.data') + "'"
        if os.path.isfile(file):
            for line in fileinput.FileInput(file, inplace=1):
                line = re.sub('FROM\s\'.+hybrid_part.data\'', querystring,
                              line)
                print str(re.sub('\n', '', line))

    def preprocess(self):
        """
        Replace the hard-coded information in the workload files with values
        for the current machine.

        For every workload directory, the files under its 'sql' folder and
        then its 'expected' folder are scanned by name: names containing
        'wet_ret' go through handle_ext_cases() and names containing
        'hybrid_part' go through handle_hybrid_part_cases().
        """

        workload_dirs = (
            'set_sync1', 'sync1', 'set_ck_sync1', 'ck_sync1', 'set_ct', 'ct',
            'set_resync', 'resync', 'set_sync2', 'sync2',
        )
        for workload in workload_dirs:
            sql_path = os.path.join(local_path(workload), 'sql')
            ans_path = os.path.join(local_path(workload), 'expected')
            for folder in (sql_path, ans_path):
                for entry in os.listdir(folder):
                    if 'wet_ret' in entry:
                        self.handle_ext_cases(os.path.join(folder, entry))
                    if 'hybrid_part' in entry:
                        self.handle_hybrid_part_cases(
                            os.path.join(folder, entry))

    def clean_data(self):
        """
        Remove the external-table data files left by earlier runs.

        Deletes everything matching data/*.* under the local test directory
        with ``rm -rfv``; otherwise running several sql files would keep
        appending to the same external table.
        """

        data_glob = str(local_path("")) + "data/*.*"
        run_shell_command('rm -rfv ' + data_glob)

    def anydownsegments(self):
        """
        Report whether the cluster is free of down segments.

        Despite what the name suggests, the result is True when
        count_of_nodes_down() reports zero and False otherwise.
        """

        tinctest.logger.info("Checking if any segments are down")
        return int(self.count_of_nodes_down()) == 0

    def stop_start_validate(self, stopValidate=True):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine

        A failed shutdown only raises when stopValidate is true.  A failed
        start, or any segment still marked down afterwards, always raises.
        Return: (True, "All segments are up") on success
        """

        tinctest.logger.info("Performing stop start validate")
        tinctest.logger.info("Shutting down the cluster")
        stopped = self.gpstop.run_gpstop_cmd(immediate='i',
                                             validate=stopValidate)
        if stopValidate and not stopped:
            raise Exception('Problem while shutting down the cluster')
        tinctest.logger.info("Successfully shutdown the cluster.")

        tinctest.logger.info("Restarting the cluster.")
        if not self.gpstart.run_gpstart_cmd():
            raise Exception('Failed to bring the cluster back up')
        tinctest.logger.info("Successfully restarted the cluster.")

        # anydownsegments() is True when *no* segment is down.
        if self.anydownsegments():
            return (True, "All segments are up")
        raise Exception("segments were marked down")

    def method_reset_fault_injection(self):
        """
        Resets the filerep_resync fault on all primaries

        Raises an Exception when the injection utility reports failure.
        Return: (True, str(output of the fault injection))
        """

        tinctest.logger.info("Resetting fault injection")

        reset_args = dict(f='filerep_resync',
                          m='async',
                          y='reset',
                          r='primary',
                          H='ALL')
        reset_ok, reset_out = self.util.inject_fault(**reset_args)
        if not reset_ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault  to reset resync")

        return (True, str(reset_out))

    def method_resume_filerep_resync(self):
        """
        Resumes the process of resync

        Injects 'resume' for the filerep_resync fault on all primaries and
        raises an Exception when the injection utility reports failure.
        Return: the (ok, out) pair handed back by the injection utility
        """

        tinctest.logger.info("Resuming Resync")
        resume_args = dict(f='filerep_resync',
                           m='async',
                           y='resume',
                           r='primary',
                           H='ALL')
        outcome = self.util.inject_fault(**resume_args)
        (resumed, output) = outcome
        if not resumed:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done resuming resync")
        return (resumed, output)

    def run_method_suspendresync(self):
        """
        Stops the cluster from going to resync

        Injects 'suspend' for the filerep_resync fault on all primaries.  The
        injection output is logged before the result is checked; an Exception
        is raised when the injection utility reports failure.
        Return: the (ok, out) pair handed back by the injection utility
        """

        tinctest.logger.info("Suspending resync")
        suspend_args = dict(f='filerep_resync',
                            m='async',
                            y='suspend',
                            r='primary',
                            H='ALL')
        (suspended, output) = self.util.inject_fault(**suspend_args)
        tinctest.logger.info('output from suspend resync %s' % output)
        if not suspended:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to suspend resync")
        return (suspended, output)

    def count_of_masters(self):
        """
        Gives count of number of nodes in the cluster that are master
        (rows of gp_segment_configuration with content = -1).

        The query is run through query_select_count(), which owns the parsing
        of the psql output.
        Return: the count, as the stripped string taken from the psql output
        """

        tinctest.logger.info("Count the number of masters")
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where content = -1")

    def count_of_nodes(self):
        """
        Gives count of number of nodes in the cluster
        (all rows of gp_segment_configuration).

        The query is run through query_select_count(), which owns the parsing
        of the psql output.
        Return: the count, as the stripped string taken from the psql output
        """

        tinctest.logger.info("Counting number of nodes")
        return self.query_select_count(
            "select count(*) from gp_segment_configuration")

    def count_of_nodes_in_ct(self):
        """
        Gives count of number of nodes in change tracking
        (rows of gp_segment_configuration with mode = 'c').

        The query is run through query_select_count(), which owns the parsing
        of the psql output.
        Return: the count, as the stripped string taken from the psql output
        """

        tinctest.logger.info("Counting number of nodes in ct")
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where mode = 'c'")

    def count_of_nodes_down(self):
        """
        Gives count of number of nodes marked as down
        (rows of gp_segment_configuration with status = 'd').

        The query is run through query_select_count(), which owns the parsing
        of the psql output.
        Return: the count, as the stripped string taken from the psql output
        """

        tinctest.logger.info("Counting the number of nodes down")
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where status = 'd'")

    def count_of_nodes_sync(self):
        """
        Gives count of number of nodes in sync
        (rows of gp_segment_configuration with mode = 's').

        The query is run through query_select_count(), which owns the parsing
        of the psql output.
        Return: the count, as the stripped string taken from the psql output
        """

        tinctest.logger.info("Counting the number of nodes in sync")
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where mode = 's'")

    def count_of_nodes_not_sync(self):
        """
        Gives count of number of nodes not in sync
        (rows of gp_segment_configuration with mode <> 's').

        The query is run through query_select_count(), which owns the parsing
        of the psql output.
        Return: the count, as the stripped string taken from the psql output
        """

        tinctest.logger.info("Counting number of nodes not in sync")
        return self.query_select_count(
            "select count(*) from gp_segment_configuration where mode <> 's'")

    def inject_fault_on_first_primary(self):
        """
	@product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        tinctest.logger.info("\n Injecting faults on first primary")
        (ok,
         out) = self.util.inject_fault(f='filerep_immediate_shutdown_request',
                                       m='async',
                                       y='infinite_loop',
                                       r='primary',
                                       seg_id=2,
                                       sleeptime=300)
        if not ok:
            raise Exception(
                "Fault filerep_immediate_shutdown_request injection failed")

        (ok, out) = self.util.inject_fault(f='fileRep_is_operation_completed',
                                           m='async',
                                           y='infinite_loop',
                                           r='primary',
                                           seg_id=2)
        if not ok:
            raise Exception(
                "Fault fileRep_is_operation_completed injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def inject_fault_on_first_mirror(self):
        """
	@product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        sqlcmd = "select dbid from gp_segment_configuration where content=0 and role='m'"
        (first_mirror_dbid) = PSQL.run_sql_command(sqlcmd)
        first_mirror_dbid = first_mirror_dbid.split('\n')[3].strip()

        tinctest.logger.info("\n Injecting faults on first mirror")
        flag = self.util.check_fault_status(
            fault_name='fileRep_is_operation_completed',
            status='triggered',
            max_cycle=100)
        if not flag:
            raise Exception(
                "Fault fileRep_is_operation_completed didn't trigger")

        (ok, out) = self.util.inject_fault(f='filerep_consumer',
                                           m='async',
                                           y='panic',
                                           r='mirror',
                                           seg_id=first_mirror_dbid)
        if not ok:
            raise Exception("Fault filerep_consumer injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def setupGpfdist(self, port, path):
        """
        Kill the gpfdist bound to ``port`` on hostIP() and start a new one
        with the options ' -t 30 -m 1048576 -d ' followed by ``path``.
        Always returns True.
        """
        server = Gpfdist(port, self.hostIP())
        server.killGpfdist()
        start_options = ' -t 30 -m 1048576 -d ' + path
        server.startGpfdist(start_options)
        return True

    def cleanupGpfdist(self, port, path):
        """
        Kill the gpfdist bound to ``port`` on hostIP().  ``path`` is accepted
        but not used here.  Always returns True.
        """
        server = Gpfdist(port, self.hostIP())
        server.killGpfdist()
        return True

    def hostIP(self):
        """
        Return the host identifier to use for gpfdist.

        Returns the literal 'mdw' when the local host name contains 'mdw',
        otherwise the local host's IP address as a string.

        Raises GPtestError when ``which gpfdist`` fails.
        """
        if not run_shell_command('which gpfdist'):
            raise GPtestError("Error:'which gpfdist' command failed.")

        hostname = socket.gethostname()
        # A substring test also matches host names that *start* with 'mdw'
        # (index 0); the earlier ``find('mdw') > 0`` check skipped those.
        if 'mdw' in hostname:
            host = 'mdw'
        else:
            host = str(socket.gethostbyname(hostname))  # Must be an IP
        tinctest.logger.info('current host is %s' % host)
        return host

    def method_setup(self):
        """
        Prepare the cluster for the filerep workload.

        Creates the five filespaces used by the tests, sets
        max_resource_queues to 100 through gpconfig, and restarts the cluster
        (gpstop -i followed by gpstart).  Raises an Exception when gpconfig or
        gpstart reports failure; the gpstop result is not checked.
        Always returns True otherwise.
        """
        tinctest.logger.info("Performing setup tasks")

        filespaces = Gpfilespace()
        for fs_name in ('filerep_fs_a', 'filerep_fs_b', 'filerep_fs_c',
                        'filerep_fs_z', 'sync1_fs_1'):
            filespaces.create_filespace(fs_name)

        # Set max_resource_queues to 100
        if not run_shell_command('gpconfig -c max_resource_queues -v 100 '):
            raise Exception(
                'Failure during setting the max_resource_queues value to 100 using gpconfig tool'
            )

        # Restart the cluster
        self.gpstop.run_gpstop_cmd(immediate='i')
        if not self.gpstart.run_gpstart_cmd():
            raise Exception('Failure during restarting the cluster')
        return True

    def get_ext_table_query_from_gpstate(self):
        """
        Extract the sample external-table SQL printed by gpstate.

        Runs ``gpstate --printSampleExternalTableSql`` with stdout redirected
        to the local 'gpstate_tmp' file, then returns every line of that file
        from the first one containing
        'DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status' onwards,
        concatenated unchanged (line endings included).

        Return: the SQL text, or an empty string when the marker line is not
        present in the gpstate output
        """
        outfile = local_path("gpstate_tmp")
        ok = run_shell_command("gpstate --printSampleExternalTableSql >" +
                               outfile)
        if not ok:
            # Best effort: keep going as before, but leave a trace in the log.
            tinctest.logger.info(
                "gpstate --printSampleExternalTableSql reported failure")

        marker = 'DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status'
        collected = []
        in_sql = False
        # The context manager closes the file even if reading raises.
        with open(outfile, 'r') as handle:
            for line in handle:
                if not in_sql and marker in line:
                    in_sql = True
                if in_sql:
                    collected.append(line)
        return "".join(collected)

    def check_gpstate(self, type, phase):
        """ 
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """

        if phase == 'sync1':
            state_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Synchronized' and status_in_config='Up' and instance_status='Up'"
            )
            sync1_num = self.query_select_count(
                "select count(*) from gp_segment_configuration where content <> -1"
            )
            if int(sync1_num) <> int(state_num):
                raise Exception("gpstate in Sync state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'ct':
            p_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Change Tracking'  and role = 'Primary' and status_in_config='Up' and instance_status='Up'"
            )
            m_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Out of Sync'  and role = 'Mirror' and status_in_config='Down' and instance_status='Down in configuration' "
            )

            if int(p_num) <> int(m_num):
                raise Exception("gpstate in CT state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'resync_incr':

            if type == 'primary':
                query = "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Incremental'"
                resync_incr_num = self.query_select_count(query)
            else:
                query = "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
                resync_incr_num = self.query_select_count(query)

            query_num_rows = "select count(*) from gp_segment_configuration where content <> -1"
            num_rows = self.query_select_count(query_num_rows)

            if int(resync_incr_num) <> int(num_rows):
                tinctest.logger.info("resync_incr_num query run %s" % query)
                tinctest.logger.info("num_rows query run %s" % query_num_rows)
                raise Exception(
                    "gpstate in Resync Incremental  state failed. resync_incr_num %s <> num_rows %s"
                    % (resync_incr_num, num_rows))
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'resync_full':
            num_rows = self.query_select_count(
                "select count(*) from gp_segment_configuration where content <> -1"
            )

            if type == 'primary':
                resync_full_num = self.query_select_count(
                    "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'"
                )
            else:
                resync_full_num = self.query_select_count(
                    "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'"
                )

            if int(resync_full_num) <> int(num_rows):
                raise Exception("gptate in Resync Full state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        return True

    def trigger_transition(self):
        """
        Run the local mirrors.sql file through PSQL.
        """
        sql_file = local_path('mirrors.sql')
        PSQL.run_sql_file(sql_file)

    def run_gpstate(self, type, phase):
        """
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2

        Writes the SQL returned by get_ext_table_query_from_gpstate() to the
        local create_table_gpstate.sql file and runs it, runs ``gpstate -s -a``
        with its output captured in the local gpstate_out file, and finally
        validates the state through check_gpstate(type, phase).

        Return: the result of the ``gpstate -s -a`` shell command
        """

        tinctest.logger.info("running gpstate")
        querystring = self.get_ext_table_query_from_gpstate()

        sql_file = local_path('create_table_gpstate.sql')
        # The context manager closes the file even if a write raises.
        with open(sql_file, 'w') as handle:
            handle.write(querystring)
            handle.write('\n')
        PSQL.run_sql_file(sql_file)

        gpstate_outfile = local_path('gpstate_out')
        ok = run_shell_command('gpstate -s -a > %s 2>&1' % (gpstate_outfile))
        self.check_gpstate(type, phase)
        return ok

    def check_mirror_seg(self, master=False):
        """
        Run the mirror-integrity check of the validation helper.
        ``master`` is accepted but not used here.
        """
        tinctest.logger.info("running check mirror")
        validator = self.dbstate
        validator.check_mirrorintegrity()

    def do_gpcheckcat(self,
                      dbname=None,
                      alldb=False,
                      online=False,
                      outputFile='checkcat.out',
                      outdir=None):
        """
        Run the catalog check of the validation helper.

        Only ``outputFile`` is forwarded; ``dbname``, ``alldb``, ``online``
        and ``outdir`` are accepted but not used here.
        """
        tinctest.logger.info("running gpcheckcat")
        validator = self.dbstate
        validator.check_catalog(outputFile=outputFile)

    def query_select_count(self, sqlcmd):
        """
        Run ``sqlcmd`` through PSQL and return the stripped fourth line of
        its output, which is where the value of a single-row result is
        expected to sit.
        """
        output_lines = PSQL.run_sql_command(sqlcmd).split('\n')
        return output_lines[3].strip()

    def method_run_failover(self, type):
        """
        Inject fault to failover nodes
        @type: primary [induces fault in mirror] mirror [creates panic in primary]

        Any other ``type`` value injects nothing.  The outcome of the
        injection is not inspected.
        Return: True in every case
        """

        if type == 'primary':
            banner = "\n primary failover"
            fault_args = dict(f='filerep_consumer',
                              m='async',
                              y='fault',
                              r='mirror',
                              H='ALL')
        elif type == 'mirror':
            banner = "\n Mirror failover"
            fault_args = dict(f='postmaster',
                              m='async',
                              y='panic',
                              r='primary',
                              H='ALL')
        else:
            return True

        tinctest.logger.info(banner)
        (injected, output) = self.util.inject_fault(**fault_args)
        tinctest.logger.info("\n Done Injecting Fault")
        return True

    def wait_till_change_tracking_transition(self):
        self.util.wait_till_change_tracking_transition()

    def wait_till_insync_transition(self):
        self.gpr.wait_till_insync_transition()

    def run_gprecoverseg(self, recover_mode):
        if recover_mode == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

    def run_gpconfig(self, parameter, master_value, segment_value):
        if (parameter is not None):
            self.gpconfig.setParameter(parameter, master_value, segment_value)
            self.gpstop.run_gpstop_cmd(restart='r')

    def inject_fault(self,
                     fault=None,
                     mode=None,
                     operation=None,
                     prim_mirr=None,
                     host='All',
                     table=None,
                     database=None,
                     seg_id=None,
                     sleeptime=None,
                     occurence=None):
        if (fault == None or mode == None or operation == None
                or prim_mirr == None):
            raise Exception('Incorrect parameters provided for inject fault')

        (ok, out) = self.util.inject_fault(f=fault,
                                           m=mode,
                                           y=operation,
                                           r=prim_mirr,
                                           H='ALL',
                                           table=table,
                                           database=database,
                                           sleeptime=sleeptime,
                                           o=occurence,
                                           seg_id=seg_id)
Example #19
0
    def test_mpp23395(self):
        """
        
        @description Test MPP-20964, uncleaned lock table by pg_terminate_backend
        @product_version gpdb: [4.3.3.1-],[4.2.8.5-4.2.99.99]
        """
        # NOTE(review): the @description above names MPP-20964 while the
        # method and the table it uses are named mpp23395 -- confirm which
        # ticket this test is meant to cover.
        #
        # Each scenario below hands a SQL snippet plus a fault name, a fault
        # type and a number to self.run_sequence(), which is defined elsewhere.
        # Presumably the number is the segment id the fault is injected on and
        # the optional fifth argument toggles a follow-up step -- TODO confirm
        # against run_sequence.
        self.util = Filerepe2e_Util()

        # Start clean: reset the dtm_broadcast_commit_prepared fault on seg_id=1.
        (ok,out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1);
        if not ok:
           raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared")
 
        # setup
        PSQL.run_sql_command("""
          DROP TABLE IF EXISTS mpp23395;
          """)

        # Scenario 1: FAULT during Create Table on master
        sql = '''
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1);

        # Scenario 2: FAULT during Drop Table on master, COMMIT case
        sql = '''
        DROP TABLE mpp23395;
        '''
        self.run_sequence(sql, 'dtm_broadcast_commit_prepared', 'fatal', 1);

        # Reset the fault used by scenarios 1 and 2 before moving on.
        (ok,out) = self.util.inject_fault(f='dtm_broadcast_commit_prepared', y='reset', seg_id=1);
        if not ok:
           raise Exception("Failed to reset the fault dtm_broadcast_commit_prepared")

        # Scenario 3: FAULT during Create Table on segment, COMMIT case
        sql = '''
        SET dtx_phase2_retry_count = 1;
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2);

        # Scenario 4: FAULT during Drop Table on segment, COMMIT case
        sql = '''
        SET dtx_phase2_retry_count = 1;
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";
        DROP TABLE mpp23395;
        '''
        self.run_sequence(sql, 'twophase_transaction_commit_prepared', 'error', 2);

        # Scenario 5: FAULT during Create Table on master, ABORT case
        # The 'error' fault is armed first so that the statement takes the
        # abort path; it is reset again right after the sequence.
        (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1);
        if not ok:
           raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared")
 
        sql = '''
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1);

        (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1);
        if not ok:
           raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared")


        # Create the table for real so that scenario 6 has something to drop.
        PSQL.run_sql_command("""
        CREATE TABLE mpp23395(a int);
          """)

        # Scenario 6: FAULT during Drop Table on master, ABORT case
        (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', seg_id=1);
        if not ok:
           raise Exception("Failed to set the error fault for transaction_abort_after_distributed_prepared")
 
        sql = '''
        DROP TABLE mpp23395;
        '''
        self.run_sequence(sql, 'dtm_broadcast_abort_prepared', 'fatal', 1);

        (ok,out) = self.util.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', seg_id=1);
        if not ok:
           raise Exception("Failed to reset the fault transaction_abort_after_distributed_prepared")

        # Drop the table without any fault armed.
        PSQL.run_sql_command("""
        DROP TABLE mpp23395;
          """)


        # Scenario 7: FAULT during Create Table on segment, COMMIT case, succeeds on second retry
        # Unlike scenarios 3 and 4, dtx_phase2_retry_count is not lowered here
        # and run_sequence receives an extra False argument.
        sql = '''
        DROP TABLE IF EXISTS mpp23395;
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(sql, 'finish_prepared_after_record_commit_prepared', 'error', 2, False);

        # Scenario 8: QE panics after writing prepare xlog record.  This should
        # cause master to broadcast abort but QEs handle the abort in
        # DTX_CONTEXT_LOCAL_ONLY context.
        sql = '''
        DROP TABLE IF EXISTS mpp23395;
        CREATE TABLE mpp23395(a int);
        INSERT INTO mpp23395 VALUES(1), (2), (3);
        SET debug_abort_after_segment_prepared = true;
        DELETE FROM mpp23395;
        '''

        # No prepared transactions should remain lingering
        # (this scenario runs its SQL directly instead of via run_sequence)
        PSQL.run_sql_command(sql)
        self.check_no_dangling_prepared_transaction()

        # Finish with a catalog check of the cluster.
        dbstate = DbStateClass('run_validation')
        dbstate.check_catalog()
Example #20
0
class FtsTransitions(MPPTestCase):
    """
    Helper steps used by the FTS transition scenarios: killing processes,
    injecting faults, running recovery, restarting the cluster and validating
    the resulting state.
    """

    def __init__(self, methodName):
        # Collaborators are created before delegating to the base class
        # constructor; that ordering is kept from the original code.
        self.pgport = os.environ.get('PGPORT')
        self.fileutil = Filerepe2e_Util()
        self.gpconfig = GPDBConfig()
        self.gprecover = GpRecover(self.gpconfig)
        self.gpstate = Gpstate()
        self.gpprimarymirror = Gpprimarymirror()
        self.base = GPDBStorageBaseTestCase(self.gpconfig)
        super(FtsTransitions, self).__init__(methodName)

    def _run_and_log(self, cmd, cmdString):
        ''' Run the given Command, then log the command line, its rc and its stdout '''
        cmd.run()
        tinctest.logger.info('run command %s' % cmdString)
        results = cmd.get_results()
        tinctest.logger.info('Command returning, rc: %s, result: %s' % (results.rc, results.stdout))

    def kill_first_mirror(self):
        ''' kill -9 every process whose ps line mentions the content-0 mirror's default filespace location '''
        mirror_data_loc = self.get_default_fs_loc(role='m', content=0)
        # Only the host is needed; the port returned alongside it is unused.
        (host, _port) = self.gpconfig.get_hostandport_of_segment(psegmentNumber=0, pRole='m')
        cmdString = "ps -ef|grep -v grep|grep '%s'|awk '{print $2}'|xargs kill -9" % mirror_data_loc
        remote = Command(name='kill first mirror', cmdStr=cmdString, ctxt=2, remoteHost=host)
        self._run_and_log(remote, cmdString)

    def kill_master_process(self, ProcName=None):
        '''
        kill -9 the postgres processes whose ps line matches both PGPORT and ProcName
        @param ProcName: text to grep for in the ps output
        '''
        # NOTE(review): with the default ProcName=None the literal text 'None' is grepped for.
        cmdString = "ps -ef|grep postgres| grep %s | grep '%s'| awk '{print $2}'|xargs kill -9" % (self.pgport, ProcName)
        cmd = Command('kill process on master', cmdStr=cmdString)
        self._run_and_log(cmd, cmdString)

    def get_default_fs_loc(self, role='m', content=0):
        '''
        Return the first line of the fselocation query result for the segment
        with the given role and content id (filespace oid 3052).
        '''
        fs_sql = '''select fselocation from pg_filespace_entry
                    where fsefsoid = 3052 and fsedbid = (select dbid from gp_segment_configuration
                    where role = \'%s\' and content = %s);'''%(role,content)
        result = PSQL.run_sql_command(fs_sql, flags='-q -t', dbname='template1')
        return result.strip().split('\n')[0]

    def gpconfig_alter(self, type, bool):
        '''
        Alter postgres configuration: append filerep_inject_listener_fault=<bool>
        to postgresql.conf of every matching segment.
        @param type: 'primary' or 'mirror'
        @param bool: the string 'true' or 'false'
        Raises ValueError for any other value of type or bool.
        '''
        # Validate up front: previously an unsupported value left fault_string
        # or fse_location unbound and failed with a NameError inside the loop.
        if bool not in ('true', 'false'):
            raise ValueError("bool must be 'true' or 'false', got %r" % (bool,))
        if type not in ('primary', 'mirror'):
            raise ValueError("type must be 'primary' or 'mirror', got %r" % (type,))

        fault_string = "filerep_inject_listener_fault=%s" % bool
        for record in self.gpconfig.record:
            # Records with content -1 are never altered.
            if record.content == -1:
                continue
            # A truthy record.role selects the record for 'primary', a falsy one for 'mirror'.
            if type == 'primary' and not record.role:
                continue
            if type == 'mirror' and record.role:
                continue
            fse_location = record.datadir
            run_shell_command("ssh %s 'echo %s >> %s/postgresql.conf'" % (record.hostname, fault_string, fse_location))
            tinctest.logger.info( "\n ssh   %s   'echo %s  >>   %s/postgresql.conf'" % (record.hostname, fault_string,  fse_location))
        # Completion message is logged once, after all records were handled.
        tinctest.logger.info( "\n  Done set %s in postgresql.conf on all primary segments" % fault_string)

    def set_faults(self, fault_name, type, role='mirror', port=None, occurence=None, sleeptime=None, seg_id=None):
        ''' Issue the fault with the given type; no reset is performed here '''
        self.fileutil.inject_fault(f=fault_name, y=type, r=role, p=port, o=occurence, sleeptime=sleeptime, seg_id=seg_id)

    def resume_faults(self, fault_name, role='mirror'):
        ''' Resume the fault issues '''
        self.fileutil.inject_fault(f=fault_name, y='resume', r=role)

    def run_validation(self):
        ''' Create a fresh DbStateClass (stored on self.dbstate) and run its mirror integrity check '''
        tinctest.logger.info('Veriy the integrity between primary and mirror ...')
        self.dbstate = DbStateClass('run_validation')
        self.dbstate.check_mirrorintegrity()

    def incremental_recoverseg(self, workerPool=False):
        ''' Delegate to GpRecover.incremental '''
        self.gprecover.incremental(workerPool)

    def run_recoverseg_if_ct(self):
        ''' Run incremental recovery when at least one node is counted in mode 'c' '''
        num_down = self.gpconfig.count_of_nodes_in_mode('c')
        if int(num_down) > 0:
            self.incremental_recoverseg()

    def wait_till_change_tracking(self):
        ''' Delegate to Filerepe2e_Util.wait_till_change_tracking_transition '''
        self.fileutil.wait_till_change_tracking_transition()

    def wait_till_insync(self):
        ''' Delegate to GpRecover.wait_till_insync_transition '''
        self.gprecover.wait_till_insync_transition()

    def run_gpstate(self, type, phase):
        ''' Delegate to Gpstate.run_gpstate '''
        self.gpstate.run_gpstate(type, phase)

    def run_gpprimarymirror(self):
        ''' Delegate to Gpprimarymirror.run_gpprimarymirror '''
        self.gpprimarymirror.run_gpprimarymirror()

    def verify_gpprimarymirror_output(self, total_resync=0, cur_resync=0):
        ''' Assert that Gpprimarymirror.verify_gpprimarymirror_output reports success '''
        status = self.gpprimarymirror.verify_gpprimarymirror_output(total_resync, cur_resync)
        self.assertTrue(status, 'Total and Cur resync object count mismatch')

    def run_gpstate_shell_cmd(self, options):
        ''' Delegate to Gpstate.run_gpstate_shell_cmd '''
        self.gpstate.run_gpstate_shell_cmd(options)

    def verify_gpstate_output(self):
        ''' Assert that Gpstate.verify_gpstate_output reports success '''
        status = self.gpstate.verify_gpstate_output()
        self.assertTrue(status, 'Total and Cur resync object count mismatch')

    def run_trigger_sql(self):
        ''' Run a sql statement to trigger postmaster reset '''
        PSQL.run_sql_file(local_path('test_ddl.sql'))

    def run_fts_test_ddl_dml(self):
        ''' Run fts_test_ddl_dml.sql '''
        PSQL.run_sql_file(local_path('fts_test_ddl_dml.sql'))

    def run_fts_test_ddl_dml_before_ct(self):
        ''' Run fts_test_ddl_dml_before_ct.sql '''
        PSQL.run_sql_file(local_path('fts_test_ddl_dml_before_ct.sql'))

    def run_fts_test_ddl_dml_ct(self):
        ''' Run fts_test_ddl_dml_ct.sql '''
        PSQL.run_sql_file(local_path('fts_test_ddl_dml_ct.sql'))

    def run_sql_in_background(self):
        ''' Recreate table bar through a psql command started with background=True '''
        PSQL.run_sql_command('drop table if exists bar; create table bar(i int);', background=True)

    def sleep_for_transition(self):
        ''' Sleep 100 seconds to let the transition to change tracking complete '''
        #gp_segment_connect_timeout is set to 10s , still need a little more time than that to complete the transition to ct
        sleep(100)

    def restart_db(self):
        ''' Stop and then start the database through the storage base test case '''
        self.base.stop_db()
        self.base.start_db()

    def stop_db_with_no_rc_check(self):
        ''' Gpstop and dont check for rc '''
        cmd = Command('Gpstop_a', 'gpstop -a')
        tinctest.logger.info('Executing command: gpstop -a')
        cmd.run()

    def start_db_with_no_rc_check(self):
        ''' Gpstart and dont check for rc '''
        cmd = Command('Gpstart_a', 'gpstart -a')
        tinctest.logger.info('Executing command: gpstart -a')
        cmd.run()

    def restart_db_with_no_rc_check(self):
        ''' gpstop -a followed by gpstart -a, ignoring both return codes '''
        self.stop_db_with_no_rc_check()
        self.start_db_with_no_rc_check()

    def set_gpconfig(self, param, value):
        ''' Set the configuration parameter using gpconfig, then restart the database '''
        command = "gpconfig -c %s -v %s --skipvalidation " % (param, value)
        run_shell_command(command)
        self.restart_db()

    def check_db(self):
        ''' Delegate to checkDBUp() '''
        checkDBUp()

    def check_fault_status(self, fault_name, seg_id=None, role=None):
        ''' Assert that the fault reaches status 'triggered' (checked with max_cycle=20) '''
        status = self.fileutil.check_fault_status(fault_name=fault_name, status='triggered', max_cycle=20, role=role, seg_id=seg_id)
        self.assertTrue(status, 'The fault is not triggered in the time expected')

    def cluster_state(self):
        ''' Assert that GPDBConfig.is_not_insync_segments() returns a truthy value '''
        state = self.gpconfig.is_not_insync_segments()
        self.assertTrue(state, 'The cluster is not up and in sync')
Example #21
0
 def test_gpcheckcat(self):
     """Run the catalog check for the Steps.dbname database only (alldb is False)."""
     tinctest.logger.info('Run Checkcat to verify persistent table consistency')
     DbStateClass('run_validation').check_catalog(alldb=False, dbname=Steps.dbname)
Example #22
0
class FilerepTestCase(MPPTestCase):
    """
    Helper steps for the filerep end-to-end scenarios: file manipulation in
    segment data directories, workload preprocessing, fault injection,
    cluster restart, segment counting and gpstate verification.
    """

    def __init__(self, methodName):
        # Collaborators are created before delegating to the base class
        # constructor; that ordering is kept from the original code.
        self.pgport = os.environ.get('PGPORT')
        self.util = Filerepe2e_Util()
        self.gpconfig = GpConfig()
        self.config = GPDBConfig()
        self.gpr = GpRecover(self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        super(FilerepTestCase, self).__init__(methodName)

    def sleep(self, seconds=60):
        """
        Sleep for the given number of seconds (60 by default)
        """
        time.sleep(seconds)

    def _segment_file_location(self, content, role, filename):
        """
        Return (host, path) where path is filename joined to the data directory
        of the segment identified by content and role
        """
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        return host, os.path.join(datadir, filename)

    def create_file_in_datadir(self, content, role, filename):
        """
        touch filename in the segment's data directory on the segment host
        """
        host, file_path = self._segment_file_location(content, role, filename)
        cmd = Command('create a file', 'touch %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def remove_file_in_datadir(self, content, role, filename):
        """
        rm filename from the segment's data directory on the segment host
        """
        host, file_path = self._segment_file_location(content, role, filename)
        cmd = Command('remove a file', 'rm %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def get_timestamp_of_file_in_datadir(self, content, role, filename):
        """
        Return the stripped stdout of a remote python one-liner printing
        os.stat(<file>).st_mtime for filename in the segment's data directory
        """
        host, file_path = self._segment_file_location(content, role, filename)
        cmd = Command('check timestamp', """ python -c "import os; print os.stat('%s').st_mtime" """ %
                      file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)
        return cmd.get_results().stdout.strip()

    def verify_file_exists(self, content, role, filename):
        """
        Run 'test -f' on filename in the segment's data directory, validating the command result
        """
        host, file_path = self._segment_file_location(content, role, filename)
        cmd = Command('check if file exists', 'test -f %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def _rewrite_file_in_place(self, path, pattern, replacement):
        """
        Apply re.sub(pattern, replacement) to every line of path, rewriting the
        file in place. Does nothing when path is not an existing file.
        """
        if not os.path.isfile(path):
            return
        rewriter = fileinput.FileInput(path, inplace=1)
        try:
            for line in rewriter:
                line = re.sub(pattern, replacement, line)
                # With inplace=1 standard output is redirected into the file.
                # print(...) with one argument behaves the same on Python 2 and 3.
                print(str(re.sub('\n', '', line)))
        finally:
            # Always close so stdout is restored even if a substitution fails.
            rewriter.close()

    def handle_ext_cases(self, file):
        """
        @file: wet sql file to replace with specific machine env.
        Every 'gpfdist...8088' match is replaced with gpfdist://<local ip>:8088.
        """
        host = str(socket.gethostbyname(socket.gethostname())) #Must be an IP
        querystring = "gpfdist://" + host + ":8088"
        self._rewrite_file_in_place(file, 'gpfdist.+8088', querystring)

    def handle_hybrid_part_cases(self, file):
        """
        @file: hybrid sql file to replace with specific machine env
        The FROM '<...>hybrid_part.data' clause is pointed at the local hybrid_part.data.
        """
        querystring = "FROM '" + local_path('hybrid_part.data') + "'"
        # Raw string: same pattern as before without the invalid '\s' string escape.
        self._rewrite_file_in_place(file, r"FROM\s'.+hybrid_part.data'", querystring)

    def preprocess(self):
        """
        Replace the hard-coded information from sql files with correct hostname and ip address,etc
        """
        list_workload_dir = ['set_sync1','sync1','set_ck_sync1','ck_sync1',
                        'set_ct','ct','set_resync','resync','set_sync2','sync2']
        for workload in list_workload_dir:
            # The 'sql' directory is fully processed before 'expected'.
            for subdir in ('sql', 'expected'):
                dir_path = os.path.join(local_path(workload), subdir)
                for name in os.listdir(dir_path):
                    full_path = os.path.join(dir_path, name)
                    if 'wet_ret' in name:
                        self.handle_ext_cases(full_path)
                    if 'hybrid_part' in name:
                        self.handle_hybrid_part_cases(full_path)

    def clean_data(self):
        """
        Clean the data by removing the external table, otherwise, more data will be appended to the
        same external table from running multiple sql files.
        """
        target = str(local_path("")) + "data/*.*"
        run_shell_command('rm -rfv ' + target)

    def anydownsegments(self):
        """
        checks if any segments are down
        Return: True when NO segment is marked down, False otherwise
        """
        tinctest.logger.info("Checking if any segments are down")
        return int(self.count_of_nodes_down()) == 0

    def stop_start_validate(self, stopValidate=True):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine
        Return: (True, "All segments are up"); raises Exception on any failure
        """
        tinctest.logger.info("Performing stop start validate")
        tinctest.logger.info("Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i', validate=stopValidate)
        # A failed stop is only fatal when the caller asked for validation.
        if not ok and stopValidate:
            raise Exception('Problem while shutting down the cluster')
        tinctest.logger.info("Successfully shutdown the cluster.")

        tinctest.logger.info("Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failed to bring the cluster back up')
        tinctest.logger.info("Successfully restarted the cluster.")
        # anydownsegments() is True when nothing is down.
        if not self.anydownsegments():
            raise Exception("segments were marked down")
        return (True, "All segments are up")

    def method_reset_fault_injection(self):
        """
        Resets fault injection
        Return: (True, [result]) if OK; raises Exception otherwise
        """
        tinctest.logger.info("Resetting fault injection")

        (ok1, out1) = self.util.inject_fault(f='filerep_resync', m='async', y='reset', r='primary', H='ALL')
        if not ok1:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault  to reset resync")

        return (True, str(out1))

    def method_resume_filerep_resync(self):
        """
        Resumes the process of resync
        Return: (ok, out) from the fault injection; raises Exception when it fails
        """
        tinctest.logger.info("Resuming Resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync', m='async', y='resume', r='primary', H='ALL')
        if not ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done resuming resync")
        return (ok, out)

    def run_method_suspendresync(self):
        """
        Stops the cluster from going to resync
        Return: (ok, out) from the fault injection; raises Exception when it fails
        """
        tinctest.logger.info("Suspending resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync', m='async', y='suspend', r='primary', H='ALL')
        # The output is logged before the status check so it is visible on failure too.
        tinctest.logger.info('output from suspend resync %s'%out)
        if not ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to suspend resync")
        return (ok, out)

    def count_of_masters(self):
        """
        Gives count of number of nodes in the cluster that are master
        Return: count of number of nodes in the cluster that are master (as a string)
        """
        tinctest.logger.info("Count the number of masters")
        return self.query_select_count("select count(*) from gp_segment_configuration where content = -1")

    def count_of_nodes(self):
        """
        Gives count of number of nodes in the cluster
        Return: count of number of nodes in the cluster (as a string)
        """
        tinctest.logger.info("Counting number of nodes")
        return self.query_select_count("select count(*) from gp_segment_configuration")

    def count_of_nodes_in_ct(self):
        """
        Gives count of number of nodes in change tracking
        Return: count of number of nodes in change tracking (as a string)
        """
        tinctest.logger.info("Counting number of nodes in ct")
        return self.query_select_count("select count(*) from gp_segment_configuration where mode = 'c'")

    def count_of_nodes_down(self):
        """
        Gives count of number of nodes marked as down
        Return: count of number of nodes marked as down (as a string)
        """
        tinctest.logger.info("Counting the number of nodes down")
        return self.query_select_count("select count(*) from gp_segment_configuration where status = 'd'")

    def count_of_nodes_sync(self):
        """
        Gives count of number of nodes in sync
        Return: count of number of nodes in sync (as a string)
        """
        tinctest.logger.info("Counting the number of nodes in sync")
        return self.query_select_count("select count(*) from gp_segment_configuration where mode = 's'")

    def count_of_nodes_not_sync(self):
        """
        Gives count of number of nodes not in sync
        Return: count of number of nodes not in sync (as a string)
        """
        tinctest.logger.info("Counting number of nodes not in sync")
        return self.query_select_count("select count(*) from gp_segment_configuration where mode <> 's'")

    def inject_fault_on_first_primary(self):
        """
        @product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        Inject two infinite_loop faults on the primary with seg_id 2; raises Exception when either fails
        """
        tinctest.logger.info("\n Injecting faults on first primary")
        (ok, out) = self.util.inject_fault(f='filerep_immediate_shutdown_request', m='async', y='infinite_loop', r='primary', seg_id=2, sleeptime=300)
        if not ok:
            raise Exception("Fault filerep_immediate_shutdown_request injection failed")

        (ok, out) = self.util.inject_fault(f='fileRep_is_operation_completed', m='async', y='infinite_loop', r='primary', seg_id=2)
        if not ok:
            raise Exception("Fault fileRep_is_operation_completed injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def inject_fault_on_first_mirror(self):
        """
        @product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        Wait for fileRep_is_operation_completed to trigger, then inject a
        filerep_consumer panic on the content-0 mirror; raises Exception on failure
        """
        first_mirror_dbid = self.query_select_count("select dbid from gp_segment_configuration where content=0 and role='m'")

        tinctest.logger.info("\n Injecting faults on first mirror")
        flag = self.util.check_fault_status(fault_name='fileRep_is_operation_completed', status='triggered', max_cycle=100)
        if not flag:
            raise Exception("Fault fileRep_is_operation_completed didn't trigger")

        (ok, out) = self.util.inject_fault(f='filerep_consumer', m='async', y='panic', r='mirror', seg_id=first_mirror_dbid)
        if not ok:
            raise Exception("Fault filerep_consumer injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def setupGpfdist(self, port, path):
        """
        Kill any gpfdist on the given port and start a new one serving path
        Return: True
        """
        gpfdist = Gpfdist(port, self.hostIP())
        gpfdist.killGpfdist()
        gpfdist.startGpfdist(' -t 30 -m 1048576 -d ' + path)
        return True

    def cleanupGpfdist(self, port, path):
        """
        Kill the gpfdist on the given port; path is accepted but not used
        Return: True
        """
        gpfdist = Gpfdist(port, self.hostIP())
        gpfdist.killGpfdist()
        return True

    def hostIP(self):
        """
        Return 'mdw' when the local hostname contains 'mdw' past its first
        character, otherwise the IP address the local hostname resolves to.
        Raises GPtestError when 'which gpfdist' fails.
        """
        ok = run_shell_command('which gpfdist')
        if not ok:
            raise GPtestError("Error:'which gpfdist' command failed.")
        hostname = socket.gethostname()
        # NOTE(review): find(...) > 0 does not match a hostname that *starts*
        # with 'mdw' (index 0); confirm whether >= 0 was intended.
        if hostname.find('mdw') > 0:
            host = 'mdw'
        else:
            host = str(socket.gethostbyname(socket.gethostname())) #Must be an IP
        tinctest.logger.info('current host is %s'%host)
        return host

    def method_setup(self):
        """
        Create the filespaces used by the workload, set max_resource_queues to
        100 and restart the cluster
        Return: True; raises Exception when gpconfig or gpstart fails
        """
        tinctest.logger.info("Performing setup tasks")
        gpfs = Gpfilespace()
        for filespace in ('filerep_fs_a', 'filerep_fs_b', 'filerep_fs_c', 'filerep_fs_z', 'sync1_fs_1'):
            gpfs.create_filespace(filespace)

        # Set max_resource_queues to 100
        cmd = 'gpconfig -c max_resource_queues -v 100 '
        ok = run_shell_command(cmd)
        if not ok:
            raise Exception('Failure during setting the max_resource_queues value to 100 using gpconfig tool')
        #Restart the cluster
        self.gpstop.run_gpstop_cmd(immediate='i')
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failure during restarting the cluster')
        return True

    def get_ext_table_query_from_gpstate(self):
        """
        Write 'gpstate --printSampleExternalTableSql' output to a temp file and
        return its text from the 'DROP EXTERNAL TABLE IF EXISTS
        gpstate_segment_status' line (inclusive) to the end, lines unmodified.
        """
        outfile = local_path("gpstate_tmp")
        run_shell_command("gpstate --printSampleExternalTableSql >" + outfile)
        collected = []
        collecting = False
        # The context manager closes the file, which was previously left open.
        with open(outfile, 'r') as gpstate_output:
            for line in gpstate_output:
                if line.find('DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status') >= 0:
                    collecting = True
                if collecting:
                    collected.append(line)
        return "".join(collected) ############RUN QYUERY

    def check_gpstate(self, type, phase):
        """
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage; the phases checked here are sync1, ct,
                resync_incr and resync_full, any other phase is accepted unchecked
        Return: True; raises Exception when the counts for the phase do not match
        """
        if phase == 'sync1':
            state_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Synchronized' and status_in_config='Up' and instance_status='Up'")
            sync1_num = self.query_select_count("select count(*) from gp_segment_configuration where content <> -1")
            if int(sync1_num) != int(state_num):
                raise Exception("gpstate in Sync state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " %(phase))

        elif phase == 'ct':
            p_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Change Tracking'  and role = 'Primary' and status_in_config='Up' and instance_status='Up'")
            m_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Out of Sync'  and role = 'Mirror' and status_in_config='Down' and instance_status='Down in configuration' ")

            if int(p_num) != int(m_num):
                raise Exception("gpstate in CT state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " %(phase))

        elif phase == 'resync_incr':
            # Only the 'primary' failover type restricts the count to role = preferred_role.
            if type == 'primary':
                query = "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Incremental'"
            else:
                query = "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
            resync_incr_num = self.query_select_count(query)

            query_num_rows = "select count(*) from gp_segment_configuration where content <> -1"
            num_rows = self.query_select_count(query_num_rows)

            if int(resync_incr_num) != int(num_rows):
                tinctest.logger.info("resync_incr_num query run %s" % query)
                tinctest.logger.info("num_rows query run %s" % query_num_rows)
                raise Exception("gpstate in Resync Incremental  state failed. resync_incr_num %s <> num_rows %s" % (resync_incr_num, num_rows))
            tinctest.logger.info("Done Running gpstate in %s phase " %(phase))

        elif phase == 'resync_full':
            num_rows = self.query_select_count("select count(*) from gp_segment_configuration where content <> -1")

            if type == 'primary':
                resync_full_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'")
            else:
                resync_full_num = self.query_select_count("select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'")

            if int(resync_full_num) != int(num_rows):
                raise Exception("gptate in Resync Full state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " %(phase))

        return True

    def trigger_transition(self):
        """
        Run mirrors.sql
        """
        PSQL.run_sql_file(local_path('mirrors.sql'))

    def run_gpstate(self, type, phase):
        """
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        Return: the result of running 'gpstate -s -a'
        """
        tinctest.logger.info("running gpstate")
        querystring = self.get_ext_table_query_from_gpstate()
        sql_file = local_path('create_table_gpstate.sql')
        # The context manager closes the file even when a write fails.
        with open(sql_file, 'w') as f1:
            f1.write(querystring)
            f1.write('\n')
        PSQL.run_sql_file(sql_file)

        gpstate_outfile = local_path('gpstate_out')
        cmd = 'gpstate -s -a > %s 2>&1' % (gpstate_outfile)

        ok = run_shell_command(cmd)
        self.check_gpstate(type, phase)
        return ok

    def check_mirror_seg(self, master=False):
        """
        Run the mirror integrity check; the master argument is accepted but not used
        """
        tinctest.logger.info("running check mirror")
        self.dbstate.check_mirrorintegrity()

    def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
        """
        Run the catalog check. Only outputFile is forwarded; dbname, alldb,
        online and outdir are accepted but not used.
        """
        tinctest.logger.info("running gpcheckcat")
        self.dbstate.check_catalog(outputFile=outputFile)

    def query_select_count(self, sqlcmd):
        """
        Run sqlcmd through psql and return the stripped 4th line (index 3) of
        its output, as a string
        """
        # presumably index 3 is the single value row after the psql header lines - verify against PSQL output
        output = PSQL.run_sql_command(sqlcmd)
        return output.split('\n')[3].strip()

    def method_run_failover(self, type):
        """
        Inject fault to failover nodes
        @type: primary [induces fault in mirror] mirror [creates panic in primary]
        Return: True always; the result of the fault injection is not checked,
                and any other type injects nothing
        """
        if type == 'primary':
            tinctest.logger.info("\n primary failover")
            self.util.inject_fault(f='filerep_consumer', m='async', y='fault', r='mirror', H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")

        elif type == 'mirror':
            tinctest.logger.info("\n Mirror failover")
            self.util.inject_fault(f='postmaster', m='async', y='panic', r='primary', H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")
        return True

    def wait_till_change_tracking_transition(self):
        """
        Delegate to Filerepe2e_Util.wait_till_change_tracking_transition
        """
        self.util.wait_till_change_tracking_transition()

    def wait_till_insync_transition(self):
        """
        Delegate to GpRecover.wait_till_insync_transition
        """
        self.gpr.wait_till_insync_transition()

    def run_gprecoverseg(self, recover_mode):
        """
        Run full recovery when recover_mode is 'full', incremental recovery otherwise
        """
        if recover_mode == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

    def run_gpconfig(self, parameter, master_value, segment_value):
        """
        Set parameter through GpConfig and restart the cluster; does nothing when parameter is None
        """
        if parameter is not None:
            self.gpconfig.setParameter(parameter, master_value, segment_value)
            self.gpstop.run_gpstop_cmd(restart='r')

    def inject_fault(self, fault = None, mode = None, operation = None, prim_mirr = None, host = 'All', table = None, database = None, seg_id = None, sleeptime = None, occurence = None):
        """
        Inject a fault through Filerepe2e_Util.inject_fault
        fault, mode, operation and prim_mirr are mandatory; an Exception is
        raised when any of them is None. The host argument is accepted but not
        used: the fault is always injected with H='ALL'. The result of the
        injection is not checked.
        """
        if fault is None or mode is None or operation is None or prim_mirr is None:
            raise Exception('Incorrect parameters provided for inject fault')

        self.util.inject_fault(f=fault, m=mode, y=operation, r=prim_mirr, H='ALL', table=table, database=database, sleeptime=sleeptime, o=occurence, seg_id=seg_id)
Example #23
0
 def check_mirror_seg(self):
     """Run the mirror integrity check with master=True on a fresh DbStateClass."""
     DbStateClass('run_validation').check_mirrorintegrity(master=True)
Example #24
0
class SubTransactionLimitRemovalTestCase(MPPTestCase):
    # Step library for the sub-transaction limit removal (STLR) scenarios:
    # cluster state checks, SQL runs, fault injection/suspend/resume/reset
    # through Filerepe2e_Util, cluster restart/recovery and final validation.

    def __init__(self, methodName):
        super(SubTransactionLimitRemovalTestCase,self).__init__(methodName)

    def _inject_fault_or_raise(self, fault_name, fault_type, role='primary',
                               error_msg="[STLRTest]Fault injection failed"):
        '''
        @summary : Issue one async fault request on all hosts and raise if it is not reported OK.
                   Expects self.util to have been set by the caller.
        @param fault_name : Name of the fault (passed as f=)
        @param fault_type : Fault action (passed as y=), e.g. reset/suspend/resume/skip
        @param role : Segment role (passed as r=)
        @param error_msg : Message of the Exception raised when the injector reports failure
        @return : The output returned by the injector
        '''
        (ok, out) = self.util.inject_fault(f=fault_name, m='async', y=fault_type, r=role, H='ALL')
        if not ok:
            raise Exception(error_msg)
        return out

    def check_system(self):
        '''
        @summary: Check whether the system is up and in sync. Raise an Exception if not
        '''
        tinctest.logger.info("[STLRTest] Running check_system")

        tinctest.logger.info("[STLRTest] Check whether the system is up and sync")

        # Both counts are read from the fourth line of the psql output
        # (presumably header, separator, then the value - TODO confirm).
        cmd ="select count(*) from gp_segment_configuration where content<> -1 ;"
        count_all = PSQL.run_sql_command(cmd).split('\n')[3].strip()

        cmd ="select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';"
        count_up_and_sync = PSQL.run_sql_command(cmd).split('\n')[3].strip()

        tinctest.logger.info("[STLRTest] printing gp segment configuration")
        gp_seg_conf = PSQL.run_sql_command("select * from gp_segment_configuration order by dbid")
        tinctest.logger.info(gp_seg_conf)

        if count_all != count_up_and_sync:
            raise Exception("[STLRTest] System not in sync and up. Exiting test")
        tinctest.logger.info("[STLRTest] Starting New Test: System is up and in sync...")

    def run_sqls(self,test):
        '''
        @summary : Run the sql
        @param test: sql file, given relative to this module (resolved with local_path)
        '''
        tinctest.logger.info("[STLRTest] Running run_sqls")
        tinctest.logger.info("[STLRTest]Starting new thread to run sql %s"%(test))
        PSQL.run_sql_file(local_path(test))

    def suspend_faults(self,fault_name):
        '''
        @summary : Suspend the specified fault: reset it before issuing suspend
        @param fault_name : Name of the fault to suspend
        '''
        tinctest.logger.info("[STLRTest] Running suspend_faults")

        self.util = Filerepe2e_Util()

        self._inject_fault_or_raise(fault_name, 'reset')
        tinctest.logger.info("[STLRTest]Done resetting the %s fault"%(fault_name))

        self._inject_fault_or_raise(fault_name, 'suspend')
        tinctest.logger.info("[STLRTest]Done suspending the %s fault"%(fault_name))

    def check_fault_status(self,fault_name = None, status = None, max_cycle=10):
        '''
        @summary : Poll the fault status until the requested status shows up
        @param fault_name : Fault name
        @param status : Status to be checked - triggered/completed
        @param max_cycle : Maximum number of polls; a 10 second sleep follows every unsuccessful poll
        @return : True once a status line mentions both fault_name and status, False after max_cycle polls
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running check_fault_status %s", status)

        if (not fault_name) or (not status) :
            raise Exception("[STLRTest]Need a value for fault_name and status to continue")

        poll = 0
        while poll < max_cycle:
            # The OK flag of a status query is deliberately not checked; only the text is scanned.
            (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'status', r = 'primary', H ='ALL')
            poll += 1
            for line in out1.splitlines():
                # Substring test: unlike find() > 0 this also accepts a match at column 0.
                if fault_name in line and status in line:
                    tinctest.logger.info("[STLRTest]Fault %s is %s " % (fault_name,status))
                    tinctest.logger.info("[STLRTest] Running check_fault_status %s TRUE", status)
                    return True

            #sleep a while before start polling again
            sleep(10)
        tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE", status)
        return False

    def filerep_fault(self,trans_state):
        '''
        @summary : Inject the filerep fault that belongs to the given transition
        @param trans_state : failover_to_primary, failover_to_mirror or postmaster_reset;
                             any other value injects nothing
        '''
        tinctest.logger.info("[STLRTest] Running filerep_fault")
        self.util = Filerepe2e_Util()

        # trans_state -> (start message, fault name, fault action, role, done message)
        transitions = {
            'failover_to_primary': ("[STLRTest] primary failover",
                                    'filerep_consumer', 'fault', 'mirror',
                                    "[STLRTest]Done primary failover fault"),
            'failover_to_mirror': ("[STLRTest] fault for postmaster panic",
                                   'postmaster', 'panic', 'primary',
                                   "[STLRTest]Done postmaster panic fault"),
            'postmaster_reset': ("[STLRTest] fault for filerep_sender panic",
                                 'filerep_sender', 'panic', 'primary',
                                 "[STLRTest]Done filerep_sender panic fault"),
        }
        plan = transitions.get(trans_state)
        if plan is not None:
            (start_msg, fault, action, role, done_msg) = plan
            tinctest.logger.info(start_msg)
            self._inject_fault_or_raise(fault, action, role=role)
            tinctest.logger.info(done_msg)

        tinctest.logger.info("[STLRTest] Done Injecting Fault")

    def resume_faults(self,fault_name,trans_state):
        '''
        @summary : Resume the fault and check status
        @param fault_name : Name of the fault to resume
        @param trans_state : Transition under test; decides where the fault is resumed
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_faults")

        if trans_state != 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for %s resume" % fault_name)
            self._inject_fault_or_raise(fault_name, 'resume',
                                        error_msg="[STLRTest]Fault resume failed")
            tinctest.logger.info("[STLRTest]Done fault for %s resume" % fault_name)

        if trans_state == 'postmaster_reset':
            # Best effort on the mirror: a failure here is only logged.
            (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'resume', r = 'mirror', H ='ALL')
            if not ok1:
                tinctest.logger.info("[STLRTest]Failed fault for %s resume on mirror" % fault_name)

        if trans_state == 'failover_to_primary':
            # The polling result is not acted upon here.
            self.check_fault_status(fault_name,'completed')

    def checkPSQLRun(self, test):
        '''Check if the psql run started in parallel is over before running the _post.sql '''
        tinctest.logger.info("[STLRTest] Running checkPSQLRun")
        cmd_str = 'ps -ef|grep sub_transaction_limit_removal|grep psql'
        needle = '%s' % test
        while True:
            (rc , out) = shell.run(cmd_str)
            # NOTE(review): iterates `out` directly - assumes shell.run returns
            # a sequence of lines rather than one string; confirm.
            if not any(needle in line for line in out):
                return True
            sleep(10)

    def resume_filerep_resync(self):
        '''
        @summary : Resume the filerep_resync fault on the primaries, then sleep 10 seconds
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_filerep_resync")

        tinctest.logger.info("[STLRTest] fault for failover_to_mirror resume")
        self._inject_fault_or_raise('filerep_resync', 'resume')
        tinctest.logger.info("[STLRTest]Done fault for failover_to_mirror resume")
        sleep(10)

    def stop_start_validate(self, expect_down_segments=False):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine

        @param expect_down_segments : when True the result of gpstop is not checked
        @return : (True, "All segments are up") on success; raises Exception otherwise
        """
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running stop_start_validate")

        tinctest.logger.info("[STLRTest]Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate = 'i')
        if not expect_down_segments:
            if not ok:
                raise Exception('[STLRTest]Problem while shutting down the cluster')
            tinctest.logger.info("[STLRTest]Successfully shutdown the cluster.")

        tinctest.logger.info("[STLRTest]Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()

        if not ok:
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        tinctest.logger.info("[STLRTest]Successfully restarted the cluster.")
        # NOTE(review): anydownsegments() is not defined in this class; it is
        # presumably inherited, and a falsy result is treated as "segments down" - confirm.
        if not self.anydownsegments():
            raise Exception("[STLRTest]segments were marked down")
        return (True, "All segments are up")

    def run_gprecoverseg(self,recover_option):
        '''
        @summary : Call gprecoverseg full or incremental, then wait for the in-sync transition
        @param recover_option : 'full' for a full recovery, anything else for incremental
        '''
        self.gpr = GpRecover()

        tinctest.logger.info("[STLRTest] Running run_gprecoverseg")

        if recover_option == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

        self.gpr.wait_till_insync_transition()

    def run_restart_database(self):
        '''
        @summary : Restart the database (gpstop immediate, then gpstart); results are only logged
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running run_restart_database")
        ok = self.gpstop.run_gpstop_cmd(immediate = 'i')
        tinctest.logger.info(ok)
        ok = self.gpstart.run_gpstart_cmd()
        tinctest.logger.info(ok)

    def reset_faults(self,fault_name,current_cluster_state):
        '''
        @summary : Reset the faults at the end of test
        @param fault_name : Fault to reset on the primaries
        @param current_cluster_state : when 'resync' the filerep_resync fault is reset as well
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running reset_faults")

        tinctest.logger.info("[STLRTest] Resetting fault before ending test")

        self._inject_fault_or_raise(fault_name, 'reset')
        tinctest.logger.info("[STLRTest]Done resetting %s fault" %(fault_name))

        if current_cluster_state == 'resync':
            self._inject_fault_or_raise('filerep_resync', 'reset')
            tinctest.logger.info("[STLRTest]Done filerep_resync fault")

        # The checkpoint fault is always reset, whatever fault_name was.
        self._inject_fault_or_raise('checkpoint', 'reset')
        tinctest.logger.info("[STLRTest]Done resetting checkpoint fault" )

    def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
        '''
        @summary : Run the catalog check with default options; always returns True
                   (none of the parameters is forwarded to check_catalog)
        '''
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
        self.dbstate.check_catalog()
        return True

    def _validation(self):
        '''
        @summary :gpcheckcat and gpcheckmirrorintegrity
        '''

        ###psql.run_shell_command("CHECKPOINT; CHECKPOINT; CHECKPOINT;CHECKPOINT; CHECKPOINT;")
        ###sleep(30) # sleep for some time for the segments to be in sync before validation

        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running _validation")

        outfile = local_path("subt_checkcat.out")
        self.dbstate.check_catalog(outputFile=outfile)

        self.dbstate.check_mirrorintegrity()

    def inject_and_resume_fault(self, fault_name, trans_state):
        '''
        @summary : Wait for the fault to trigger, inject the transition fault, then resume
        @param fault_name : Fault that is expected to be triggered and later resumed
        @param trans_state : Transition to exercise (see filerep_fault)
        '''
        # The polling result is not acted upon; the sequence continues either way.
        self.check_fault_status(fault_name, 'triggered')
        self.filerep_fault(trans_state)
        if trans_state == 'failover_to_mirror':
            PSQL.run_sql_file(local_path('test_while_ct.sql'))
        self.resume_faults(fault_name, trans_state)

    def run_post_sqls(self, fault_name ='', trans_state=''):
        '''
        @summary : Run the post sql for the scenario and fail the test if its output differs from the .ans file
        @param fault_name : Used only in the failure message
        @param trans_state : 'failover_to_primary' or '' selects the post_commit files, anything else post_abort
        '''
        PSQL.wait_for_database_up()
        if trans_state in ('failover_to_primary', ''):
            post_sql = "failover_sql/subt_create_table_ao_post_commit"
        else:
            post_sql = "failover_sql/subt_create_table_ao_post_abort"

        sql_file = post_sql+".sql"
        ans_file = post_sql+".ans"
        out_file = post_sql+".out"

        PSQL.run_sql_file(sql_file = local_path(sql_file), out_file = local_path(out_file))
        diff_res = Gpdiff.are_files_equal(local_path(out_file), local_path(ans_file))

        if not diff_res:
            self.fail("[STLRTest]Gpdiff failed for : %s %s" %(fault_name, trans_state))

    def reset_all_faults(self):
        '''
        @summary : Reset all faults on primary and mirror
        '''
        tinctest.logger.info("[STLRTest] Running reset_all_faults")
        self.util = Filerepe2e_Util()

        self._inject_fault_or_raise('all', 'reset', role='primary')
        tinctest.logger.info("[STLRTest]Done resetting all faults on primary")

        self._inject_fault_or_raise('all', 'reset', role='mirror')
        tinctest.logger.info("[STLRTest]Done resetting all faults fault on mirror")

    def kill_zombies(self):
        '''
        @summary : There are stray zombie processes running after each test. This method clears them
        '''
        tinctest.logger.info("[STLRTest] Running kill_zombies")
        # Emits "<field 3>#<field 2>" of every matching ps line, filtered by `grep -w 1`.
        cmd_str = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1"
        cmd = Command("shell_command", cmd_str)
        tinctest.logger.info('Executing command: %s : %s' %("shell command", cmd_str))
        cmd.run()
        result = cmd.get_results()
        for line in result.stdout.split('\n'):
            (parent, _sep, pid) = line.partition('#')
            # Only kill entries whose first field is exactly 1; skip lines without a pid part.
            if parent == '1' and pid:
                kill_str = "kill -9 %s" %(pid)
                Command("kill_command", kill_str).run()

    def skip_checkpoint(self):
        '''
        @summary : Routine to inject fault that skips checkpointing
        '''

        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running skip_checkpoint")

        self._inject_fault_or_raise('checkpoint', 'reset')
        tinctest.logger.info("[STLRTest]Done resetting the checkpoint fault")

        self._inject_fault_or_raise('checkpoint', 'skip')
        tinctest.logger.info("[STLRTest]Done skipping the checkpoint fault")

    def method_setup(self):
        '''
        @summary : Create the filespace 'subt_filespace_a'
        '''
        tinctest.logger.info("Performing setup tasks")
        gpfs=Gpfilespace()
        gpfs.create_filespace('subt_filespace_a')

    def cleandb(self):
        '''
        @summary : Call Database.setupDatabase for the 'gptest' database
        '''
        db = Database()
        db.setupDatabase('gptest')
Example #25
0
class AOCOAlterColumn(MPPTestCase):
    # Helpers for the AO/CO "alter table add/drop column" tests: running the
    # SQL files under <module dir>/sql, diffing against <module dir>/expected,
    # catalog validation, and a change-tracking scenario driven by a
    # postmaster panic on one primary.

    def __init__(self):
        # NOTE(review): takes no methodName and never invokes the base-class
        # __init__ - confirm this is intentional for an MPPTestCase subclass.
        self.fileutil = Filerepe2e_Util()
        self.gprecover = GpRecover()
        self.config = GpConfig()
        # Directory of the module that defines the concrete class.
        self.base_dir = os.path.dirname(sys.modules[self.__class__.__module__].__file__)


    def get_sql_files(self, sql_file_name):
        ''' Return the path <base_dir>/sql/<sql_file_name>.sql '''
        return os.path.join(self.base_dir, "sql", sql_file_name + ".sql")

    def validate_sql(self, ans_file, out_file):
        ''' Compare the out and ans files '''
        init_file = os.path.join(self.base_dir, "sql", 'init_file')
        result1 = Gpdiff.are_files_equal(out_file, ans_file, match_sub =[init_file])
        self.assertTrue(result1 ,'Gpdiff.are_files_equal')

    def run_sql(self, filename, out_file,background=False):
        ''' Run the provided sql file; returns None '''
        # NOTE(review): the out_file argument is discarded - the output path is
        # always derived from filename. Kept as-is; confirm callers rely on it.
        out_file = local_path(filename.replace(".sql", ".out"))
        PSQL.run_sql_file(filename,out_file=out_file,background=background)


    def run_test_CatalogCheck(self, action,storage):
        ''' Run the <action>_<storage> sql script '''
        file_name = action+'_'+storage
        sql_file = self.get_sql_files(file_name)
        out_file = self.base_dir+ "/sql/"+file_name+'.out'
        tinctest.logger.info( 'sql-file == %s \n' % sql_file)
        tinctest.logger.info( 'out-file == %s \n' % out_file)
        # Run Add/Drop Column script
        self.run_sql(sql_file, out_file=out_file)

    def validate_test_CatalogCheck(self, action,storage):
        ''' Diff the <action>_<storage> output against its ans file, then run the catalog check '''
        file_name = action+'_'+storage
        out_file = self.base_dir+ "/sql/"+file_name+'.out'
        ans_file = self.base_dir+ "/expected/"+file_name+'.ans'
        tinctest.logger.info( 'out-file == %s \n' % out_file)
        tinctest.logger.info( 'ans-file == %s \n' % ans_file)
        # Validate Ans file
        self.validate_sql(ans_file,out_file)
        if storage == 'multisegfiles':
            # check if multi_segfile_tab file has  multiple segfiles per column
            tablename = 'multi_segfile_tab'
            relid = self.get_relid(file_name=tablename )
            (u_port, u_host) = self.get_utilitymode_conn_info( relid=relid)
            segfile_cnt = int(self.get_segment_cnt(relid=relid,host=u_host,port= u_port))
            # assertTrue instead of a bare assert: a bare assert is stripped under python -O.
            self.assertTrue(1 < segfile_cnt,
                            'expected more than one segfile per column, found %s' % segfile_cnt)
        # Check Correctness of the catalog
        self.dbstate = DbStateClass('run_validation')
        outfile = local_path("gpcheckcat_"+datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')+".out")
        self.dbstate.check_catalog(outputFile=outfile)

    def run_test_ChangeTracking(self,filename):
        ''' Drive the change-tracking scenario for the given sql file prefix '''
        # Log the segment state before starting the test
        # Expectation is a SYNC state
        self.log_segment_state()
        primary_dbid = self.get_dbid()
        # Run the 'alter table add column command' in the background
        self.run_sql_ChangeTracking(filename,stage='fail',validate=False,background=True)
        # Inject Fault to put one primary in panic
        self.fileutil.inject_fault(f='postmaster', y='reset',  seg_id=primary_dbid)
        self.fileutil.inject_fault(f='postmaster', y='panic',  seg_id=primary_dbid)
        # The polling result is not acted upon.
        self.fileutil.check_fault_status(fault_name='postmaster', status='triggered')
        self.log_segment_state()
        # Recover the down segments
        self.recover_seg()
        self.log_segment_state()
        # Validate that the previous alter failed because primary segment went down as the alter was taking place
        self.run_sql_ChangeTracking(filename,stage='failvalidate',validate=True,background=False)
        # Now the system is in change tracking so the next alter should pass
        self.run_sql_ChangeTracking(filename,stage='pass',validate=True,background=False)
        self.log_segment_state()


    def recover_seg(self):
        ''' Run an incremental recovery when any segment is down; raise if recovery or resync fails '''
        result = self.get_segcount_state(state='d')
        if result > 0:
            if not self.gprecover.incremental():
                raise Exception('Gprecoverseg failed')
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')
        tinctest.logger.info('Segments recovered and back in sync')


    def run_sql_ChangeTracking(self,filename,stage,validate=False,background=False):
        ''' Run <filename>-<stage>.sql and, when validate is set, diff its output against the ans file '''
        fname = filename+'-'+stage
        sql_file = self.get_sql_files(fname)
        out_file = self.base_dir+ "/sql/"+fname +'.out'
        ans_file = self.base_dir+ "/expected/"+fname+'.ans'
        tinctest.logger.info( '\n==============stage = %s ================' % (stage))
        tinctest.logger.info( sql_file)
        tinctest.logger.info( out_file)
        tinctest.logger.info( ans_file)
        tinctest.logger.info( '==============================')
        self.run_sql(sql_file,out_file=out_file,background=background)
        if validate:
            self.validate_sql(ans_file,out_file)


    def get_dbid(self):
        ''' Return the raw psql output holding the lowest dbid among the up primaries '''
        sql_cmd = "select min(dbid) dbid from gp_segment_configuration where role = 'p' and status = 'u' and content > -1"
        dbid = PSQL.run_sql_command(sql_cmd= sql_cmd,flags='-q -t')
        tinctest.logger.info('Segments %s chosen for fault injection' % (dbid))
        return dbid

    def log_segment_state(self):
        ''' Log the full contents of gp_segment_configuration '''
        sql_cmd = "select * from gp_segment_configuration order by dbid"
        result = PSQL.run_sql_command(sql_cmd= sql_cmd)
        tinctest.logger.info('==========================')
        tinctest.logger.info('State of Segments ')
        tinctest.logger.info(result)
        tinctest.logger.info('==========================')

    def get_segcount_state(self,state):
        ''' Return, as an int, the number of segments whose status equals state '''
        sql_cmd = "select count(*) from gp_segment_configuration where status = '%s'" % (state)
        result = PSQL.run_sql_command(sql_cmd= sql_cmd,flags='-q -t')
        count = int(result)
        tinctest.logger.info('Number of segments in %s State == %d' % (state, count))
        return count

    def get_utilitymode_conn_info(self, relid=0):
        ''' Return [port, hostname] of one primary with dbid > 1 (relid is unused) '''
        #get the segment_id where to log in utility mode and then get the hostname and port for this segment
        sql_cmd="select port, hostname from gp_segment_configuration sc  where dbid > 1 and role = 'p' limit 1;"
        utilitymodeinfo = PSQL.run_sql_command(sql_cmd=sql_cmd,  flags='-q -t')
        fields = utilitymodeinfo.strip().split('|')
        # Drop the padding psql puts around the column separator.
        u_port = fields[0].strip()
        u_host = fields[1].strip()
        return [u_port,u_host]

    def get_relid(self,file_name=None):
        ''' Return the raw psql output holding the pg_class oid of relation file_name '''
        sql_cmd="SELECT oid FROM pg_class WHERE relname='%s';\n" % file_name
        relid = PSQL.run_sql_command(sql_cmd=sql_cmd,  flags='-q -t')
        return relid

    def get_segment_cnt(self, relid=0,host=None,port=None):
        ''' Return the query output as a string, or '0' when the output is blank '''
        sql_cmd="select count(*) from gp_toolkit.__gp_aocsseg(%s) group by column_num having count(*) > 1 limit 1" % (relid)
        segcnt = PSQL.run_sql_command_utility_mode(sql_cmd=sql_cmd,host=host, port=port,flags='-q -t')
        if not segcnt.strip():
            segcnt = '0'
        return segcnt

    def run_test_utility_mode(self,filename):
        ''' Run and validate <filename>.sql in utility mode on one primary '''
        #alter_aoco_tab_utilitymode
        relid = self.get_relid(file_name=filename )
        (u_port, u_host) = self.get_utilitymode_conn_info( relid=relid)
        self.run_sql_utility_mode(filename,host=u_host,port=u_port)


    def run_sql_utility_mode(self,filename,host=None,port=None):
        ''' Run <filename>.sql in utility mode against host/port and diff the output with the ans file '''
        fname = filename
        sql_file = self.get_sql_files(fname)
        out_file = self.base_dir+ "/sql/"+fname +'.out'
        ans_file = self.base_dir+ "/expected/"+fname+'.ans'
        tinctest.logger.info( '\n==============================')
        tinctest.logger.info( sql_file)
        tinctest.logger.info( out_file)
        tinctest.logger.info( ans_file)
        tinctest.logger.info( '==============================')
        PSQL.run_sql_file_utility_mode(sql_file,out_file=out_file,host=host, port=port)
        self.validate_sql(ans_file,out_file)
Example #26
0
 def test_gpcheckcat(self):
     # Catalog check restricted to the single database named by Verification.dbname.
     DbStateClass('run_validation').check_catalog(alldb=False, dbname=Verification.dbname)
Example #27
0
    def test_mpp23395(self):
        """
        
        @description Test MPP-20964, uncleaned lock table by pg_terminate_backend
        @product_version gpdb: [4.3.3.1-],[4.2.8.5-4.2.99.99]
        """
        self.util = Filerepe2e_Util()

        def apply_fault(fault, action, failure_text):
            # Every fault in this test targets seg_id=1; a failed request aborts the test.
            (ok, out) = self.util.inject_fault(f=fault, y=action, seg_id=1)
            if not ok:
                raise Exception(failure_text)

        broadcast_fault = 'dtm_broadcast_commit_prepared'
        abort_fault = 'transaction_abort_after_distributed_prepared'
        broadcast_reset_failed = "Failed to reset the fault dtm_broadcast_commit_prepared"
        abort_set_failed = "Failed to set the error fault for transaction_abort_after_distributed_prepared"
        abort_reset_failed = "Failed to reset the fault transaction_abort_after_distributed_prepared"

        create_sql = '''
        CREATE TABLE mpp23395(a int);
        '''
        drop_sql = '''
        DROP TABLE mpp23395;
        '''

        apply_fault(broadcast_fault, 'reset', broadcast_reset_failed)

        # setup
        PSQL.run_sql_command("""
          DROP TABLE IF EXISTS mpp23395;
          """)

        # Scenario 1: FAULT during Create Table on master
        self.run_sequence(create_sql, 'dtm_broadcast_commit_prepared', 'fatal', 1)

        # Scenario 2: FAULT during Drop Table on master, COMMIT case
        self.run_sequence(drop_sql, 'dtm_broadcast_commit_prepared', 'fatal', 1)

        apply_fault(broadcast_fault, 'reset', broadcast_reset_failed)

        # Scenario 3: FAULT during Create Table on segment, COMMIT case
        segment_create_sql = '''
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";
        CREATE TABLE mpp23395(a int);
        '''
        self.run_sequence(segment_create_sql, 'twophase_transaction_commit_prepared', 'error', 2)

        # Scenario 4: FAULT during Drop Table on segment, COMMIT case
        segment_drop_sql = '''
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";
        DROP TABLE mpp23395;
        '''
        self.run_sequence(segment_drop_sql, 'twophase_transaction_commit_prepared', 'error', 2)

        # Scenario 5: FAULT during Create Table on master, ABORT case
        apply_fault(abort_fault, 'error', abort_set_failed)
        self.run_sequence(create_sql, 'dtm_broadcast_abort_prepared', 'fatal', 1)
        apply_fault(abort_fault, 'reset', abort_reset_failed)

        # Re-create the table so that scenario 6 has something to drop.
        PSQL.run_sql_command("""
        CREATE TABLE mpp23395(a int);
          """)

        # Scenario 6: FAULT during Drop Table on master, ABORT case
        apply_fault(abort_fault, 'error', abort_set_failed)
        self.run_sequence(drop_sql, 'dtm_broadcast_abort_prepared', 'fatal', 1)
        apply_fault(abort_fault, 'reset', abort_reset_failed)

        PSQL.run_sql_command("""
        DROP TABLE mpp23395;
          """)

        # Finish with a catalog check.
        DbStateClass('run_validation').check_catalog()
Example #28
0
 def check_mirror_seg(self, master=False):
     """Log a notice, then run DbStateClass.check_mirrorintegrity(master=master)."""
     tinctest.logger.info("running check mirror")
     DbStateClass('run_validation').check_mirrorintegrity(master=master)
Example #29
0
class FtsTransitions(MPPTestCase):
    # Step library for the FTS transition scenarios: killing processes,
    # editing postgresql.conf on segments, fault injection, recovery,
    # gpstate / gpprimarymirror checks and SQL workloads.

    def __init__(self, methodName):
        self.pgport = os.environ.get('PGPORT')
        self.fileutil = Filerepe2e_Util()
        self.gpconfig = GPDBConfig()
        self.gprecover = GpRecover(self.gpconfig)
        self.gpstate = Gpstate()
        self.gpprimarymirror = Gpprimarymirror()
        self.base = GPDBStorageBaseTestCase(self.gpconfig)
        super(FtsTransitions, self).__init__(methodName)

    def kill_first_mirror(self):
        ''' kill -9 every process whose ps line mentions the data directory of the content-0 mirror '''
        mirror_data_loc = self.get_default_fs_loc(role='m', content=0)
        (host,
         port) = self.gpconfig.get_hostandport_of_segment(psegmentNumber=0,
                                                          pRole='m')
        cmdString = "ps -ef|grep -v grep|grep '%s'|awk '{print $2}'|xargs kill -9" % mirror_data_loc
        remote = Command(name='kill first mirror',
                         cmdStr=cmdString,
                         ctxt=2,
                         remoteHost=host)
        remote.run()
        tinctest.logger.info('run command %s' % cmdString)
        results = remote.get_results()
        tinctest.logger.info('Command returning, rc: %s, result: %s' %
                             (results.rc, results.stdout))

    def kill_master_process(self, ProcName=None):
        ''' kill -9 the local postgres processes whose ps line matches both self.pgport and ProcName '''
        cmdString = "ps -ef|grep postgres| grep %s | grep '%s'| awk '{print $2}'|xargs kill -9" % (
            self.pgport, ProcName)
        cmd = Command('kill process on master', cmdStr=cmdString)
        cmd.run()
        tinctest.logger.info('run command %s' % cmdString)
        results = cmd.get_results()
        tinctest.logger.info('Command returning, rc: %s, result: %s' %
                             (results.rc, results.stdout))

    def get_default_fs_loc(self, role='m', content=0):
        ''' Return the first line of the fselocation query for the segment with the given role and content '''
        fs_sql = '''select fselocation from pg_filespace_entry
                    where fsefsoid = 3052 and fsedbid = (select dbid from gp_segment_configuration
                    where role = \'%s\' and content = %s);''' % (role, content)
        result = PSQL.run_sql_command(fs_sql,
                                      flags='-q -t',
                                      dbname='template1')
        return result.strip().split('\n')[0]

    def gpconfig_alter(self, type, bool):
        '''
        Alter postgres configuration: append filerep_inject_listener_fault=<bool>
        to postgresql.conf of every selected segment via ssh.

        @param type : 'primary' or 'mirror' - which segments (content != -1) to touch
        @param bool : the string 'true' or 'false'
        @raise ValueError : for any other type/bool value (these previously
                            surfaced as an UnboundLocalError inside the loop)
        '''
        if bool not in ('true', 'false'):
            raise ValueError("gpconfig_alter: bool must be 'true' or 'false', got %r" % (bool,))
        if type not in ('primary', 'mirror'):
            raise ValueError("gpconfig_alter: type must be 'primary' or 'mirror', got %r" % (type,))

        fault_string = "filerep_inject_listener_fault=%s" % bool
        for record in self.gpconfig.record:
            if record.content == -1:
                continue
            # A truthy record.role marks the record as a primary here.
            if type == 'primary' and not record.role:
                continue
            if type == 'mirror' and record.role:
                continue
            fse_location = record.datadir
            run_shell_command('ssh ' + record.hostname + " 'echo " +
                              fault_string + ' >> ' + fse_location +
                              "/postgresql.conf'")
            tinctest.logger.info(
                "\n ssh   %s   'echo %s  >>   %s/postgresql.conf'" %
                (record.hostname, fault_string, fse_location))
            # Logged once per segment; the wording says "primary" for mirrors too.
            tinctest.logger.info(
                "\n  Done set %s in postgresql.conf on all primary segments" %
                fault_string)

    def set_faults(self,
                   fault_name,
                   type,
                   role='mirror',
                   port=None,
                   occurence=None,
                   sleeptime=None,
                   seg_id=None):
        ''' Issue the fault with the given type (no reset is performed first) '''
        self.fileutil.inject_fault(f=fault_name,
                                   y=type,
                                   r=role,
                                   p=port,
                                   o=occurence,
                                   sleeptime=sleeptime,
                                   seg_id=seg_id)

    def resume_faults(self, fault_name, role='mirror'):
        ''' Resume the fault issues '''
        self.fileutil.inject_fault(f=fault_name, y='resume', r=role)

    def run_validation(self):
        ''' Run the mirror integrity check '''
        tinctest.logger.info(
            'Veriy the integrity between primary and mirror ...')
        self.dbstate = DbStateClass('run_validation')
        self.dbstate.check_mirrorintegrity()

    def incremental_recoverseg(self, workerPool=False):
        ''' Run an incremental recovery, forwarding workerPool '''
        self.gprecover.incremental(workerPool)

    def run_recoverseg_if_ct(self):
        ''' Run an incremental recovery when at least one node is counted in mode 'c' '''
        num_down = self.gpconfig.count_of_nodes_in_mode('c')
        if int(num_down) > 0:
            self.incremental_recoverseg()

    def wait_till_change_tracking(self):
        ''' Delegate to fileutil.wait_till_change_tracking_transition() '''
        self.fileutil.wait_till_change_tracking_transition()

    def wait_till_insync(self):
        ''' Delegate to gprecover.wait_till_insync_transition() '''
        self.gprecover.wait_till_insync_transition()

    def run_gpstate(self, type, phase):
        ''' Delegate to gpstate.run_gpstate(type, phase) '''
        self.gpstate.run_gpstate(type, phase)

    def run_gpprimarymirror(self):
        ''' Delegate to gpprimarymirror.run_gpprimarymirror() '''
        self.gpprimarymirror.run_gpprimarymirror()

    def verify_gpprimarymirror_output(self, total_resync=0, cur_resync=0):
        ''' Assert that gpprimarymirror reports the expected resync counts '''
        status = self.gpprimarymirror.verify_gpprimarymirror_output(
            total_resync, cur_resync)
        self.assertTrue(status, 'Total and Cur resync object count mismatch')

    def run_gpstate_shell_cmd(self, options):
        ''' Delegate to gpstate.run_gpstate_shell_cmd(options) '''
        self.gpstate.run_gpstate_shell_cmd(options)

    def verify_gpstate_output(self):
        ''' Assert that gpstate.verify_gpstate_output() succeeds '''
        status = self.gpstate.verify_gpstate_output()
        self.assertTrue(status, 'Total and Cur resync object count mismatch')

    def run_trigger_sql(self):
        ''' Run a sql statement to trigger postmaster reset '''
        PSQL.run_sql_file(local_path('test_ddl.sql'))

    def run_fts_test_ddl_dml(self):
        ''' Run fts_test_ddl_dml.sql '''
        PSQL.run_sql_file(local_path('fts_test_ddl_dml.sql'))

    def run_fts_test_ddl_dml_before_ct(self):
        ''' Run fts_test_ddl_dml_before_ct.sql '''
        PSQL.run_sql_file(local_path('fts_test_ddl_dml_before_ct.sql'))

    def run_fts_test_ddl_dml_ct(self):
        ''' Run fts_test_ddl_dml_ct.sql '''
        PSQL.run_sql_file(local_path('fts_test_ddl_dml_ct.sql'))

    def run_sql_in_background(self):
        ''' Drop and re-create table bar with background=True '''
        PSQL.run_sql_command(
            'drop table if exists bar; create table bar(i int);',
            background=True)

    def sleep_for_transition(self):
        ''' Sleep 100 seconds to let the transition to change tracking complete '''
        #gp_segment_connect_timeout is set to 10s , still need a little more time than that to complete the transition to ct
        sleep(100)

    def restart_db(self):
        ''' Stop and start the database through self.base '''
        self.base.stop_db()
        self.base.start_db()

    def stop_db_with_no_rc_check(self):
        ''' Gpstop and dont check for rc '''
        cmd = Command('Gpstop_a', 'gpstop -a')
        tinctest.logger.info('Executing command: gpstop -a')
        cmd.run()

    def start_db_with_no_rc_check(self):
        ''' Gpstart and dont check for rc '''
        cmd = Command('Gpstart_a', 'gpstart -a')
        tinctest.logger.info('Executing command: gpstart -a')
        cmd.run()

    def restart_db_with_no_rc_check(self):
        ''' gpstop -a followed by gpstart -a, ignoring both return codes '''
        self.stop_db_with_no_rc_check()
        self.start_db_with_no_rc_check()

    def set_gpconfig(self, param, value):
        ''' Set the configuration parameter using gpconfig '''
        command = "gpconfig -c %s -v %s --skipvalidation " % (param, value)
        run_shell_command(command)
        self.restart_db()

    def check_db(self):
        ''' Delegate to checkDBUp() '''
        checkDBUp()

    def check_fault_status(self, fault_name, seg_id=None, role=None):
        ''' Assert that the fault reaches status 'triggered' within 20 polling cycles '''
        status = self.fileutil.check_fault_status(fault_name=fault_name,
                                                  status='triggered',
                                                  max_cycle=20,
                                                  role=role,
                                                  seg_id=seg_id)
        self.assertTrue(status,
                        'The fault is not triggered in the time expected')

    def cluster_state(self):
        ''' Assert that gpconfig.is_not_insync_segments() returns a truthy value '''
        state = self.gpconfig.is_not_insync_segments()
        self.assertTrue(state, 'The cluster is not up and in sync')
Example #30
0
class SuspendCheckpointCrashRecovery(MPPTestCase):
    
    def __init__(self,methodName):
        ''' Create the helper objects used by the crash-recovery steps, then initialise the base test case '''
        self.fileutil = Filerepe2e_Util()
        # One configuration object is shared by every helper that needs it.
        cluster_config = GPDBConfig()
        self.config = cluster_config
        self.gprecover = GpRecover(cluster_config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        self.gpfile = Gpfilespace(cluster_config)
        self.dbstate = DbStateClass('run_validation', cluster_config)
        self.port = os.environ.get('PGPORT')
        self.base = GPDBStorageBaseTestCase()
        super(SuspendCheckpointCrashRecovery, self).__init__(methodName)

    def check_system(self):
        '''
        @summary: Compare the number of segments with the number of segments that are
                  up and in sync. The process exits with status 1 when they differ.
        '''
        total_sql ="select count(*) from gp_segment_configuration where content<> -1 ;"
        total = PSQL.run_sql_command(total_sql, flags ='-q -t', dbname='postgres')
        healthy_sql ="select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';"
        healthy = PSQL.run_sql_command(healthy_sql, flags ='-q -t', dbname='postgres')

        if total.strip() == healthy.strip():
            tinctest.logger.info("\n Starting New Test: System is up and in sync .........")
            return
        # Hard exit: the counts disagree, so at least one segment is not up and in sync.
        os._exit(1)

    def get_items_list(self, tests):
        ''' Return the lines of the given local file as a list, each one stripped of surrounding whitespace '''
        entries = []
        with open(local_path(tests), 'r') as handle:
            for raw_line in handle:
                entries.append(raw_line.strip())
        return entries

    def checkPSQLRun(self, test):
        '''
        Check if the psql run started in background is over before running the _post.sql

        Polls "ps" every 5 seconds until no psql process line mentions `test`,
        logging the matching lines on every round. Returns True once none is left.
        There is no timeout: the call blocks for as long as such a process exists.
        '''
        marker = '%s' % test
        cmd_str = "ps -ef|grep '%s'|grep [p]sql" % test
        while True:
            cmd = Command('Check psql run', cmd_str)
            cmd.run()
            running = [line for line in cmd.get_results().stdout.splitlines()
                       if marker in line]
            if not running:
                return True
            for line in running:
                tinctest.logger.info(line)
            sleep(5)

    def modify_sql_file(self, filename):
        '''
        Replace 'gptest' with the value of PGDATABASE, in place, in the given
        sql file and in its matching .ans file.

        Raises an Exception if PGDATABASE is not set.
        '''
        dbname = os.getenv('PGDATABASE')
        if dbname is None:
            # Fail before fileinput starts rewriting the files in place;
            # otherwise re.sub would blow up in the middle of the rewrite.
            raise Exception('PGDATABASE is not set')
        ans_file = local_path(filename.replace('.sql' , '.ans'))
        for sfile in (filename, ans_file):
            # With inplace=1 everything printed inside the loop is written back to sfile.
            for line in fileinput.FileInput(sfile,inplace=1):
                line = re.sub('gptest', dbname, line)
                # A single parenthesised argument prints the same under the
                # Python 2 print statement and the Python 3 print function.
                print(str(re.sub('\n','',line)))

    def validate_sql(self, filename):
        '''
        Compare the out and ans files

        Both paths are derived from `filename` by replacing '.sql' with '.out'
        and '.ans'. The test fails if Gpdiff reports that the files differ.
        '''
        out_file = local_path(filename.replace(".sql", ".out"))
        ans_file = local_path(filename.replace('.sql' , '.ans'))
        # assertTrue instead of a bare assert: it is not stripped when running
        # with -O and it reports which pair of files did not match.
        self.assertTrue(Gpdiff.are_files_equal(out_file, ans_file),
                        'Output file %s does not match answer file %s' % (out_file, ans_file))

    def run_sql(self, filename):
        ''' Execute the sql file, writing the output next to it as .out, then validate that output '''
        result_path = local_path(filename.replace(".sql", ".out"))
        PSQL.run_sql_file(sql_file = filename, out_file = result_path)
        self.validate_sql(filename)

    def set_faults_before_executing_pre_sqls(self, cluster_state):
        '''
        Inject the checkpoint skip fault before the pre sqls run.

        For 'change_tracking' the cluster is put into change tracking first; for
        'resync' the filerep_consumer fault is injected afterwards and the method
        waits for the change tracking transition.
        '''
        if cluster_state == 'change_tracking':
            self.cluster_in_change_tracking()

        util = self.fileutil
        util.inject_fault(f='checkpoint', y='reset', r='primary', p=self.port)
        util.inject_fault(f='checkpoint', y='skip', r='primary', p=self.port, o='0')
        tinctest.logger.info('Successfully injected fault to skip checkpointing') 

        if cluster_state != 'resync':
            return
        util.inject_fault(f='filerep_consumer', y='reset')
        util.inject_fault(f='filerep_consumer', y='fault')
        util.wait_till_change_tracking_transition()

    def suspend_fault(self, fault_name):
        ''' Reset the named fault on the primaries and then inject it again with type suspend '''
        fault = '%s' % fault_name
        for fault_type in ('reset', 'suspend'):
            self.fileutil.inject_fault(f=fault, y=fault_type, o='0', r='primary', p=self.port)
        tinctest.logger.info('Successfully injected fault to suspend %s' % fault_name)

    def get_faults_before_executing_trigger_sqls(self, pass_num,cluster_state, test_type, ddl_type, aborting_create_needed=False):
        '''
        Work out which finish_prepared_transaction_* fault matches the given conditions.

        Returns the fault name, or '' when none applies (pass_num other than 1 or 2,
        or a test_type / ddl_type combination that is not listed below).
        cluster_state is only logged.
        '''
        tinctest.logger.info('Fault Conditions: pass_num = [%s], cluster_state = [%s], test_type =  [%s], ddl_type = [%s], aborting_create_needed = [%s]' % (pass_num, cluster_state, test_type, ddl_type, aborting_create_needed)) 

        # Index into the (pass1, pass2) pairs below.
        if pass_num == 1:
            slot = 0
        elif pass_num == 2:
            slot = 1
        else:
            # Pass 0 needs nothing here: the fault error_txn_abort_after_dist_prepare_on_master
            # is already set for abort tests, and for commit tests skip checkpoint is done
            # by default for all tests.
            return ''

        if test_type == 'abort':
            if aborting_create_needed:
                per_pass = ('finish_prepared_transaction_abort_pass1_aborting_create_needed',
                            'finish_prepared_transaction_abort_pass2_aborting_create_needed')
            else:
                per_pass = ('finish_prepared_transaction_abort_pass1_from_create_pending_to_aborting_create',
                            'finish_prepared_transaction_abort_pass2_from_create_pending_to_aborting_create')
        elif test_type == 'commit' and ddl_type == 'create':
            if aborting_create_needed:
                per_pass = ('finish_prepared_transaction_commit_pass1_aborting_create_needed',
                            'finish_prepared_transaction_commit_pass2_aborting_create_needed')
            else:
                per_pass = ('finish_prepared_transaction_commit_pass1_from_create_pending_to_created',
                            'finish_prepared_transaction_commit_pass2_from_create_pending_to_created')
        elif test_type == 'commit' and ddl_type == 'drop':
            per_pass = ('finish_prepared_transaction_commit_pass1_from_drop_in_memory_to_drop_pending',
                        'finish_prepared_transaction_commit_pass2_from_drop_in_memory_to_drop_pending')
        else:
            return ''

        return per_pass[slot]

    def set_faults_before_executing_trigger_sqls(self, pass_num,cluster_state, test_type, ddl_type, aborting_create_needed=False):
        '''
        Set the faults that must be in place before the trigger sqls are executed.

        Returns the name of the suspended fault, or None when pass_num is 0.
        '''
        in_resync = (cluster_state == 'resync')
        if in_resync:
            self.cluster_in_resync()

        # NOTE(review): aborting_create_needed is always passed as False here, so the
        # argument given to this method has no effect on the chosen fault. Left as is
        # because run_crash_and_recovery_fast looks the fault up the same way.
        fault_name = self.get_faults_before_executing_trigger_sqls(pass_num, cluster_state, test_type, ddl_type, aborting_create_needed=False)

        if test_type == 'abort':
            for fault_type in ('reset', 'error'):
                self.fileutil.inject_fault(f='transaction_abort_after_distributed_prepared', y=fault_type, p=self.port, o='0', seg_id=1)
            tinctest.logger.info('Successfully injected fault to error out after distributed prepare for abort tests')

        if pass_num != 0:
            self.suspend_fault(fault_name)
        else:
            fault_name = None

        if in_resync:
            for fault_type in ('reset', 'suspend'):
                self.fileutil.inject_fault(f='filerep_transition_to_sync_begin', y = fault_type, r = 'primary')
            tinctest.logger.info('Successfully suspended filerep_transition_to_sync_begin')
            # Resume resync so that trigger sql can execute while resync is in progress
            self.fileutil.inject_fault(f='filerep_resync', y = 'resume', r = 'primary')
        return fault_name

    def cluster_in_resync(self):
        '''
        1. Suspend filerep_resync, 2. Suspend filerep_transition_to_sync_before_checkpoint, 3. Run gprecoverseg

        Raises an Exception if the incremental recovery reports failure.
        '''
        self.base.invoke_fault('filerep_resync', 'suspend', role='primary')
        self.base.invoke_fault('filerep_transition_to_sync_before_checkpoint', 'suspend', role='primary', port=self.port , occurence='0')
        rc = self.gprecover.incremental()
        if not rc:
            # Same wording as the other recovery failures raised in this class.
            raise Exception('Gprecoverseg failed')
        tinctest.logger.info('Cluster in resync state')

    def switch_primary_mirror_role_in_utility_mode(self):
        '''
        Start the master in utility mode, swap the primary and mirror roles in
        gp_segment_configuration, then shut the master down again.
        '''
        start_master = Command('Start master in utility mode', 'export GPSTART_INTERNAL_MASTER_ONLY=1;gpstart -m')
        start_master.run(validateAfter=True)
        if start_master.get_results().rc != 0:
            raise Exception('Unable to start master in utility mode')
        tinctest.logger.info('Started master in utility mode')

        # Primaries are parked in a temporary role 't' so the two updates that follow
        # do not overlap: mirrors become primaries, then the parked rows become mirrors.
        role_swap_statements = (
            "update gp_segment_configuration set role='t' where role ='p' and content <> -1",
            "update gp_segment_configuration set role='p',mode='c' where role ='m' and content <> -1",
            "update gp_segment_configuration set role='m',status='d' where role ='t' and content <> -1",
        )
        utility_options = "-c gp_session_role=utility -c allow_system_table_mods=dml"
        for statement in role_swap_statements:
            PSQL.run_sql_command(statement, PGOPTIONS=utility_options)
        tinctest.logger.info('Updated the catalog to reverse the roles')

        if not self.gpstop.run_gpstop_cmd(masteronly = True):
            raise Exception('Failure to shut down the master')

    def stop_db(self):
        ''' Run gpstop with immediate=True and raise if it reports failure '''
        stopped = self.gpstop.run_gpstop_cmd(immediate = True)
        if stopped:
            tinctest.logger.info('Stopped cluster immediately')
            return
        raise Exception('Failed to stop the cluster')
    
    def start_db(self, down_segments=False):
        '''
        Start the cluster and raise if gpstart reports failure.

        Unless down_segments is true, also raise when the configuration reports
        segments that are marked down after the start.
        '''
        started = self.gpstart.run_gpstart_cmd()
        if not started:
            raise Exception('Failed to start the cluster')
        tinctest.logger.info('Started the cluster successfully')

        if not down_segments and self.config.is_down_segments():
            raise Exception('Segments got marked down')

    def run_crash_and_recovery_fast(self,test_dir, pass_num, cluster_state, test_type, ddl_type, aborting_create_needed=False):
        '''
        This is sleep free version based on fault triggered status

        For pass 0 it waits for the running sqls to finish. Otherwise it waits until the
        fault for these conditions has been triggered once per trigger sql, raising an
        Exception if that does not happen. In both cases it finishes with
        stop_start_validate(cluster_state).
        '''
        if pass_num == 0:
            self.wait_till_all_sqls_done()
        else:
            mydir=local_path(test_dir)+'/trigger_sql/sql/'
            tinctest.logger.info('mydir = %s ' % mydir)
            # mydir ends with '/', so this matches the trigger files directly inside it.
            # glob.glob is the documented equivalent of the private glob.glob1 helper.
            trigger_count = len(glob.glob(mydir + "*trigger.sql"))
            tinctest.logger.info('*** Count of trigger : %s *** ' % (trigger_count))
            if test_dir == "abort_create_tests":
                # vacuum full sql don't hit the suspend fault.
                trigger_count = trigger_count - 1
            elif test_dir == "abort_create_needed_tests":
                # Not all SQLs hit the fault for this case, hence wait for them to
                # complete and then others to hit the fault
                self.wait_till_all_sqls_done(8 + 1)
                trigger_count = 8
            elif test_dir == "abort_abort_create_needed_tests":
                # Not all SQLs hit the fault for this case, hence wait for them to
                # complete and then others to hit the fault
                self.wait_till_all_sqls_done(6 + 1)
                trigger_count = 6
            # NOTE(review): aborting_create_needed is always passed as False, matching
            # set_faults_before_executing_trigger_sqls, so the same fault name is
            # checked here as was suspended there.
            fault_type = self.get_faults_before_executing_trigger_sqls(pass_num, cluster_state, test_type, ddl_type, aborting_create_needed=False)
            fault_hit = self.fileutil.check_fault_status(fault_name=fault_type, status="triggered", num_times_hit=trigger_count)
            if not fault_hit:
                raise Exception('Fault not hit expected number of times')

        self.stop_start_validate(cluster_state)

    def wait_till_all_sqls_done(self, count=1):
        '''
        Poll pg_stat_activity once a second until at most `count` non-idle queries remain.

        The upper bound of 500 is an arbitrarily long give-up value; an Exception is
        raised when it is exhausted.
        '''
        active_sql = "select count(*) from pg_stat_activity where current_query <> '<IDLE>'"
        for _ in range(1,500):
            active = PSQL.run_sql_command(active_sql, flags='-q -t', dbname='postgres')
            if int(active.strip()) > count:
                sleep(1)
                continue
            return
        raise Exception('SQLs expected to complete but are still running')

    def stop_start_validate(self, cluster_state):
        '''
        Stop the cluster immediately, start it again in the way that fits the given
        cluster state, and finish with a catalog check.
        '''
        if cluster_state == 'sync':
            self.stop_db()
            self.switch_primary_mirror_role_in_utility_mode()
            tinctest.logger.info('Successfully switched roles of primary and mirrors in gp_segment_configuration')
            self.start_db(down_segments=True)
            if not self.gprecover.incremental():
                raise Exception('Gprecoverseg failed')
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')
        elif cluster_state == 'change_tracking':
            self.stop_db()
            self.start_db(down_segments=True)
        elif cluster_state == 'resync':
            # Resume the filerep_resync filerep_transition_to_sync_begin before stop-start
            self.fileutil.inject_fault(f='filerep_transition_to_sync_begin', y='resume', r='primary')
            self.stop_db()
            self.start_db()
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')

        self.dbstate.check_catalog(alldb=False)

    def cluster_in_change_tracking(self):
        '''
        Invoke the filerep_consumer fault on the primaries and wait for the
        change tracking transition.
        '''
        fault, fault_type = 'filerep_consumer', 'fault'
        self.base.invoke_fault(fault, fault_type, role='primary')
        self.fileutil.wait_till_change_tracking_transition()
        tinctest.logger.info('Change_tracking transition complete')


    def validate_system(self, cluster_state):
        '''
        Validate the system's integrity.

        In the 'change_tracking' state the segments are recovered and brought back in
        sync first. The mirror integrity check then runs for the segments and, when a
        master mirror is configured, for the master as well.
        '''
        if cluster_state == 'change_tracking':
            recovered = self.gprecover.incremental()
            if not recovered:
                raise Exception('Gprecoverseg failed')
            in_sync = self.gprecover.wait_till_insync_transition()
            if not in_sync:
                raise Exception('Segments not in sync')
            tinctest.logger.info('Segments recovered and back in sync')

        checker = self.dbstate
        checker.check_mirrorintegrity()
        if self.config.has_master_mirror():
            checker.check_mirrorintegrity(master=True)

    def run_fault_injector_to_skip_checkpoint(self):
        '''
        Reset the checkpoint fault and inject it again with type skip.
        Only the result of the skip injection is checked.
        '''
        tinctest.logger.info('Skip Checkpointing using fault injector.')
        # Arguments shared by the reset and the skip injection.
        shared = dict(f = 'checkpoint', r ='primary', H='ALL', m ='async', o = '0', p=self.port)
        self.fileutil.inject_fault(y = 'reset', **shared)
        (ok, out) = self.fileutil.inject_fault(y = 'skip', **shared)
        if ok:
            return
        raise Exception('Problem with injecting fault.')

    def backup_output_dir(self,test_dir, test_id):
        '''
        Copy the local test directory to "<test_dir>_<test_id>" with cp -r.

        The test fails if the command raises or exits with a non-zero return code.
        '''
        indir=local_path(test_dir)
        outdir = indir+'_'+test_id
        cmdstr="cp -r "+ indir + " " + outdir
        cmd = Command(name='run cp -r ', cmdStr=cmdstr)
        tinctest.logger.info("Taking a backup of SQL directory: %s" %cmd)
        try:
            cmd.run()
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit still propagate.
            self.fail("cp -r failed.")
        # run() is called without validation, so a failing cp has to be caught from
        # the return code.
        if cmd.get_results().rc != 0:
            self.fail("cp -r failed.")
        tinctest.logger.info("Test SQL directory Backup Done!!")

    def do_post_run_checks(self):
        '''
        Run the 'sync' stop/start validation, recover the segments once more, wait for
        them to be in sync and then check the catalog and the mirror integrity.

        Raises an Exception if recovery fails or the segments do not get back in sync.
        '''
        self.stop_start_validate('sync')

        rc = self.gprecover.incremental()
        if not rc:
            raise Exception('Gprecoverseg failed')

        # Checked like every other wait in this class instead of being ignored.
        if not self.gprecover.wait_till_insync_transition():
            raise Exception('Segments not in sync')

        tinctest.logger.info("Done going from resync to insync")
        self.dbstate.check_catalog(alldb=False)
        self.dbstate.check_mirrorintegrity()

        if self.config.has_master_mirror():
            self.dbstate.check_mirrorintegrity(master=True)
Example #31
0
 def test_gpcheckcat(self):
     # Catalog check restricted to the database named by Steps.dbname.
     message = 'Run Checkcat to verify persistent table consistency'
     tinctest.logger.info(message)
     validator = DbStateClass('run_validation')
     validator.check_catalog(alldb=False, dbname=Steps.dbname)
Example #32
0
 def run_validation(self):
     ''' Run the mirror integrity check; the DbStateClass helper is kept on self.dbstate '''
     tinctest.logger.info(
         'Verify the integrity between primary and mirror ...')
     self.dbstate = DbStateClass('run_validation')
     self.dbstate.check_mirrorintegrity()
Example #33
0
 def check_mirror_seg(self):
     ''' Run the mirror integrity check with master=True '''
     DbStateClass('run_validation').check_mirrorintegrity(master=True)
Example #34
0
 def check_mirror_seg(self, master=False):
     ''' Run the mirror integrity check, passing the master flag through '''
     tinctest.logger.info("running check mirror")
     validator = DbStateClass('run_validation')
     validator.check_mirrorintegrity(master=master)
Example #35
0
class AOCOAlterColumn(MPPTestCase):
    
    def __init__(self):
        ''' Create the helper objects and remember the directory of the module that defines this class '''
        self.fileutil = Filerepe2e_Util()
        self.gprecover = GpRecover()
        self.config = GpConfig()
        defining_module = sys.modules[self.__class__.__module__]
        self.base_dir = os.path.dirname(defining_module.__file__)


    def get_sql_files(self, sql_file_name):
        ''' Return the path <base_dir>/sql/<sql_file_name>.sql '''
        script_name = sql_file_name + ".sql"
        return os.path.join(self.base_dir, "sql", script_name)

    def validate_sql(self, ans_file, out_file):
        ''' Assert that the out file equals the ans file, with <base_dir>/sql/init_file passed as match_sub '''
        substitutions = [os.path.join(self.base_dir, "sql", 'init_file')]
        files_match = Gpdiff.are_files_equal(out_file, ans_file, match_sub=substitutions)
        self.assertTrue(files_match, 'Gpdiff.are_files_equal')

    def run_sql(self, filename, out_file,background=False):
        '''
        Run the provided sql file and write its output to out_file.

        out_file used to be overwritten unconditionally with a path derived from
        filename; it is now honoured, and the derived path (filename with '.sql'
        replaced by '.out') is only the fallback when out_file is empty or None.
        No validation happens here; callers validate the output separately.
        '''
        if not out_file:
            out_file = local_path(filename.replace(".sql", ".out"))
        PSQL.run_sql_file(filename,out_file=out_file,background=background)


    def run_test_CatalogCheck(self, action,storage):
        ''' Run the Add/Drop Column script named <action>_<storage> '''
        script = action+'_'+storage
        sql_path = self.get_sql_files(script)
        out_path = self.base_dir+ "/sql/"+script+'.out'
        for label, path in (('sql-file', sql_path), ('out-file', out_path)):
            tinctest.logger.info('%s == %s \n' % (label, path))
        self.run_sql(sql_path, out_file=out_path)

    def validate_test_CatalogCheck(self, action,storage):
        '''
        Validate the output of the <action>_<storage> script against its answer file
        and then check the catalog.

        For the 'multisegfiles' storage it additionally asserts that table
        multi_segfile_tab has more than one segfile per column.
        '''
        file_name =action+'_'+storage
        out_file = self.base_dir+ "/sql/"+file_name+'.out'
        ans_file = self.base_dir+ "/expected/"+file_name+'.ans'
        tinctest.logger.info( 'out-file == %s \n' % out_file)
        tinctest.logger.info( 'ans-file == %s \n' % ans_file)
        # Validate Ans file
        self.validate_sql(ans_file,out_file)
        if storage == 'multisegfiles':
            # check if multi_segfile_tab file has multiple segfiles per column
            tablename='multi_segfile_tab'
            relid = self.get_relid(file_name=tablename )
            utilitymodeinfo=self.get_utilitymode_conn_info( relid=relid)
            u_port=utilitymodeinfo[0]
            u_host=utilitymodeinfo[1]
            segfile_count = int(self.get_segment_cnt(relid=relid,host=u_host,port= u_port))
            # assertTrue instead of a bare assert: not stripped under -O, and it
            # reports the count that was found.
            self.assertTrue(1 < segfile_count,
                            'Expected more than one segfile per column for %s, found %d' % (tablename, segfile_count))
        # Check Correctness of the catalog
        self.dbstate = DbStateClass('run_validation')
        outfile = local_path("gpcheckcat_"+datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d%H%M%S')+".out")
        self.dbstate.check_catalog(outputFile=outfile)

    def run_test_ChangeTracking(self,filename):
        '''
        Run the alter table scenario around a primary segment failure: the 'fail' stage
        runs in the background while one primary is sent into panic, then the
        'failvalidate' and 'pass' stages run and are validated.
        '''
        # Log the segment state before starting the test; the expectation is a SYNC state
        self.log_segment_state()
        target_dbid = self.get_dbid()

        # Run the 'alter table add column' command in the background
        self.run_sql_ChangeTracking(filename, stage='fail', validate=False, background=True)

        # Inject Fault to put one primary in panic
        for fault_type in ('reset', 'panic'):
            self.fileutil.inject_fault(f='postmaster', y=fault_type,  seg_id=target_dbid)
        # The returned status is not used; the call is kept for the check itself.
        self.fileutil.check_fault_status(fault_name='postmaster', status='triggered')
        self.log_segment_state()

        # Recover the down segments
        self.recover_seg()
        self.log_segment_state()

        # First validate that the previous alter failed because the primary segment went
        # down as the alter was taking place; the system is then in change tracking, so
        # the next alter should pass.
        for stage in ('failvalidate', 'pass'):
            self.run_sql_ChangeTracking(filename, stage=stage, validate=True, background=False)
        self.log_segment_state()


    def recover_seg(self):
        '''
        Recover the segments when at least one is in state 'd', raising if the recovery
        fails or the segments do not get back in sync. The final message is logged
        whether or not a recovery was needed.
        '''
        down_count = self.get_segcount_state(state='d')
        if down_count > 0:
            recovered = self.gprecover.incremental()
            if not recovered:
                raise Exception('Gprecoverseg failed')
            in_sync = self.gprecover.wait_till_insync_transition()
            if not in_sync:
                raise Exception('Segments not in sync')
        tinctest.logger.info('Segments recovered and back in sync')
        

    def run_sql_ChangeTracking(self,filename,stage,validate=False,background=False):
        ''' Run the sql script <filename>-<stage> and, when validate is True, compare its output with the answer file '''
        script = filename+'-'+stage
        sql_path = self.get_sql_files(script)
        out_path = self.base_dir+ "/sql/"+script +'.out'
        ans_path = self.base_dir+ "/expected/"+script+'.ans'

        log = tinctest.logger.info
        log( '\n==============stage = %s ================' % (stage))
        for path in (sql_path, out_path, ans_path):
            log(path)
        log( '==============================')

        self.run_sql(sql_path, out_file=out_path, background=background)
        # Comparison against True is kept on purpose: only a value equal to True validates.
        if validate == True:
            self.validate_sql(ans_path, out_path)

 
    def get_dbid(self):
        ''' Return the psql output for the smallest dbid among the primary segments that are up '''
        query = "select min(dbid) dbid from gp_segment_configuration where role = 'p' and status = 'u' and content > -1"
        chosen = PSQL.run_sql_command(sql_cmd=query, flags='-q -t')
        tinctest.logger.info('Segments %s chosen for fault injection' % (chosen))
        return chosen
     
    def log_segment_state(self):
        ''' Log the contents of gp_segment_configuration, ordered by dbid, between two separator lines '''
        query = "select * from gp_segment_configuration order by dbid"
        snapshot = PSQL.run_sql_command(sql_cmd=query)
        separator = '=========================='
        for entry in (separator, 'State of Segments ', snapshot, separator):
            tinctest.logger.info(entry)

    def get_segcount_state(self,state):
        ''' Return, as an int, the number of gp_segment_configuration rows whose status equals `state` '''
        query = "select count(*) from gp_segment_configuration where status = '%s'" % (state)
        raw_count = PSQL.run_sql_command(sql_cmd=query, flags='-q -t')
        segment_count = int(raw_count)
        tinctest.logger.info('Number of segments in %s State == %d' % (state, segment_count))
        return segment_count

    def get_utilitymode_conn_info(self, relid=0):
        '''
        Return [port, hostname] of one primary segment to connect to in utility mode.

        relid is accepted for the callers' convenience but not used by the query.
        Both values are returned as strings without surrounding whitespace.
        '''
        # get the segment_id where to log in utility mode and then get the hostname and port for this segment
        sql_cmd="select port, hostname from gp_segment_configuration sc  where dbid > 1 and role = 'p' limit 1;"
        utilitymodeinfo=PSQL.run_sql_command(sql_cmd=sql_cmd,  flags='-q -t')
        # The row arrives as "port | hostname"; strip each field so the padding around
        # the separator does not end up in the returned values.
        fields = [field.strip() for field in utilitymodeinfo.strip().split('|')]
        u_port = fields[0]
        u_host = fields[1]
        return [u_port,u_host]

    def get_relid(self,file_name=None):
        ''' Return the psql output of the pg_class oid lookup for the relation named by file_name '''
        oid_query = "SELECT oid FROM pg_class WHERE relname='%s';\n" % file_name
        return PSQL.run_sql_command(sql_cmd=oid_query,  flags='-q -t')

    def get_segment_cnt(self, relid=0,host=None,port=None):
        '''
        Query gp_toolkit.__gp_aocsseg in utility mode on the given host and port.
        Returns the raw query output, or '0' when the output is blank.
        '''
        count_query = "select count(*) from gp_toolkit.__gp_aocsseg(%s) group by column_num having count(*) > 1 limit 1" % (relid)
        raw_count = PSQL.run_sql_command_utility_mode(sql_cmd=count_query, host=host, port=port, flags='-q -t')
        if len(raw_count.strip()) != 0:
            return raw_count
        return '0'

    def run_test_utility_mode(self,filename):
        ''' Run the sql script `filename` in utility mode against one primary segment '''
        # alter_aoco_tab_utilitymode
        relid = self.get_relid(file_name=filename )
        conn_info = self.get_utilitymode_conn_info( relid=relid)
        segment_port = conn_info[0]
        segment_host = conn_info[1]
        self.run_sql_utility_mode(filename, host=segment_host, port=segment_port)

    
    def run_sql_utility_mode(self,filename,host=None,port=None):
        ''' Run the sql script in utility mode on the given host and port, then compare its output with the answer file '''
        sql_path = self.get_sql_files(filename)
        out_path = self.base_dir+ "/sql/"+filename +'.out'
        ans_path = self.base_dir+ "/expected/"+filename+'.ans'

        log = tinctest.logger.info
        log( '\n==============================')
        for path in (sql_path, out_path, ans_path):
            log(path)
        log( '==============================')

        PSQL.run_sql_file_utility_mode(sql_path, out_file=out_path, host=host, port=port)
        self.validate_sql(ans_path, out_path)
Example #36
0
 def test_verify_catalog(self):
     # Catalog check restricted to the database named by catalog_consistency.dbname.
     validator = DbStateClass('run_validation')
     validator.check_catalog(alldb=False,
                             dbname=catalog_consistency.dbname)
Example #37
0
    def test_mpp23395(self):
        """
        
        @description Test MPP-20964, uncleaned lock table by pg_terminate_backend
        @product_version gpdb: [4.3.3.1-],[4.2.8.5-4.2.99.99]
        """
        self.util = Filerepe2e_Util()

        commit_fault = 'dtm_broadcast_commit_prepared'
        abort_fault = 'transaction_abort_after_distributed_prepared'

        def reset_fault(name):
            # Reset the fault on seg_id 1 and stop the test if that fails.
            (ok, out) = self.util.inject_fault(f=name, y='reset', seg_id=1)
            if not ok:
                raise Exception("Failed to reset the fault %s" % name)

        def set_error_fault(name):
            # Inject the fault with type error on seg_id 1 and stop the test if that fails.
            (ok, out) = self.util.inject_fault(f=name, y='error', seg_id=1)
            if not ok:
                raise Exception("Failed to set the error fault for %s" % name)

        # Statements handed to run_sequence; each is used by more than one scenario.
        create_sql = '''
        CREATE TABLE mpp23395(a int);
        '''
        drop_sql = '''
        DROP TABLE mpp23395;
        '''
        # Session settings that make commit_prepared fail on segment 0; the
        # statement under test is appended right after them.
        dtm_settings = '''
        SET debug_dtm_action_target = "protocol";
        SET debug_dtm_action_protocol = "commit_prepared";
        SET debug_dtm_action_segment = 0;
        SET debug_dtm_action = "fail_begin_command";'''

        reset_fault(commit_fault)

        # setup
        PSQL.run_sql_command("""
          DROP TABLE IF EXISTS mpp23395;
          """)

        # Scenario 1: FAULT during Create Table on master
        self.run_sequence(create_sql, commit_fault, 'fatal', 1)

        # Scenario 2: FAULT during Drop Table on master, COMMIT case
        self.run_sequence(drop_sql, commit_fault, 'fatal', 1)

        reset_fault(commit_fault)

        # Scenario 3: FAULT during Create Table on segment, COMMIT case
        self.run_sequence(dtm_settings + create_sql,
                          'twophase_transaction_commit_prepared', 'error', 2)

        # Scenario 4: FAULT during Drop Table on segment, COMMIT case
        self.run_sequence(dtm_settings + drop_sql,
                          'twophase_transaction_commit_prepared', 'error', 2)

        # Scenario 5: FAULT during Create Table on master, ABORT case
        set_error_fault(abort_fault)
        self.run_sequence(create_sql, 'dtm_broadcast_abort_prepared', 'fatal', 1)
        reset_fault(abort_fault)

        PSQL.run_sql_command("""
        CREATE TABLE mpp23395(a int);
          """)

        # Scenario 6: FAULT during Drop Table on master, ABORT case
        set_error_fault(abort_fault)
        self.run_sequence(drop_sql, 'dtm_broadcast_abort_prepared', 'fatal', 1)
        reset_fault(abort_fault)

        PSQL.run_sql_command("""
        DROP TABLE mpp23395;
          """)

        validator = DbStateClass('run_validation')
        validator.check_catalog()
Example #38
0
class AOCOAlterColumn(MPPTestCase):
    def __init__(self):
        """Create the helper objects and record this module's directory.

        ``base_dir`` is the directory of the module that defines the
        instance's class; the ``sql/`` and ``expected/`` paths used by the
        other methods are built from it.

        NOTE(review): the base-class initializer is not invoked here --
        confirm that is intentional for how this class is instantiated.
        """
        self.fileutil = Filerepe2e_Util()
        self.gprecover = GpRecover()
        self.config = GpConfig()
        # Look the module up by the *runtime* class so a subclass defined in
        # another module resolves to its own directory.
        defining_module = sys.modules[self.__class__.__module__]
        self.base_dir = os.path.dirname(defining_module.__file__)

    def get_sql_files(self, sql_file_name):
        sql_file = os.path.join(self.base_dir, "sql", sql_file_name + ".sql")
        return sql_file

    def validate_sql(self, ans_file, out_file):
        """Assert that ``out_file`` matches ``ans_file`` according to Gpdiff.

        ``<base_dir>/sql/init_file`` is handed to the comparison through its
        ``match_sub`` argument.
        """
        substitution_files = [os.path.join(self.base_dir, "sql", 'init_file')]
        files_match = Gpdiff.are_files_equal(
            out_file, ans_file, match_sub=substitution_files)
        self.assertTrue(files_match, 'Gpdiff.are_files_equal')

    def run_sql(self, filename, out_file, background=False):
        """Run the SQL file ``filename`` through ``PSQL.run_sql_file``.

        The output path is always derived from ``filename`` by replacing
        ".sql" with ".out" and passing the result through ``local_path``.

        NOTE(review): the ``out_file`` argument is accepted but never used --
        it is superseded by the derived path.  No validation of the output
        happens in this method.
        """
        derived_out_file = local_path(filename.replace(".sql", ".out"))
        PSQL.run_sql_file(
            filename, out_file=derived_out_file, background=background)

    def run_test_CatalogCheck(self, action, storage):
        """Run the SQL script named ``<action>_<storage>``.

        The script path comes from ``get_sql_files``; the output path
        ``<base_dir>/sql/<action>_<storage>.out`` is logged and passed to
        ``run_sql``.
        """
        script = action + '_' + storage
        sql_path = self.get_sql_files(script)
        out_path = self.base_dir + "/sql/" + script + '.out'
        for template, path in (('sql-file == %s \n', sql_path),
                               ('out-file == %s \n', out_path)):
            tinctest.logger.info(template % path)
        # Run Add/Drop Column script
        self.run_sql(sql_path, out_file=out_path)

    def validate_test_CatalogCheck(self, action, storage):
        """Validate the output of ``<action>_<storage>`` and check the catalog.

        Steps:
          1. Compare ``<base_dir>/sql/<action>_<storage>.out`` with
             ``<base_dir>/expected/<action>_<storage>.ans`` via
             ``validate_sql``.
          2. When ``storage`` is ``'multisegfiles'``, require that
             ``get_segment_cnt`` reports more than one segfile per column
             for table ``multi_segfile_tab``.
          3. Run ``DbStateClass('run_validation').check_catalog`` with a
             timestamped ``gpcheckcat_<YYYYmmddHHMMSS>.out`` output file.

        Any failed check is reported through ``self.assertTrue``.
        """
        file_name = action + '_' + storage
        out_file = self.base_dir + "/sql/" + file_name + '.out'
        ans_file = self.base_dir + "/expected/" + file_name + '.ans'
        tinctest.logger.info('out-file == %s \n' % out_file)
        tinctest.logger.info('ans-file == %s \n' % ans_file)
        # Validate Ans file
        self.validate_sql(ans_file, out_file)
        if storage == 'multisegfiles':
            # Check that multi_segfile_tab has multiple segfiles per column.
            tablename = 'multi_segfile_tab'
            relid = self.get_relid(file_name=tablename)
            utilitymodeinfo = self.get_utilitymode_conn_info(relid=relid)
            u_port = utilitymodeinfo[0]
            u_host = utilitymodeinfo[1]
            segfile_count = int(
                self.get_segment_cnt(relid=relid, host=u_host, port=u_port))
            # Use assertTrue instead of a bare ``assert``: it is not stripped
            # when Python runs with -O and matches validate_sql's style.
            self.assertTrue(
                1 < segfile_count,
                '%s: expected more than one segfile per column, got %d' %
                (tablename, segfile_count))
        # Check Correctness of the catalog
        self.dbstate = DbStateClass('run_validation')
        timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        outfile = local_path("gpcheckcat_" + timestamp + ".out")
        self.dbstate.check_catalog(outputFile=outfile)

    def get_dbid(self):
        """Return the lowest dbid among primary segments that are up.

        The value is the raw output of ``PSQL.run_sql_command`` (run with
        ``-q -t``) for a ``min(dbid)`` query restricted to ``role = 'p'``,
        ``status = 'u'`` and ``content > -1``; it is logged before being
        returned.
        """
        query = ("select min(dbid) dbid from gp_segment_configuration "
                 "where role = 'p' and status = 'u' and content > -1")
        chosen = PSQL.run_sql_command(sql_cmd=query, flags='-q -t')
        tinctest.logger.info('Segments %s chosen for fault injection' % chosen)
        return chosen

    def log_segment_state(self):
        """Log the full contents of gp_segment_configuration, ordered by dbid."""
        segment_rows = PSQL.run_sql_command(
            sql_cmd="select * from gp_segment_configuration order by dbid")
        banner = '=========================='
        # Same four log records as before: banner, title, rows, banner.
        for entry in (banner, 'State of Segments ', segment_rows, banner):
            tinctest.logger.info(entry)

    def get_segcount_state(self, state):
        """Return how many rows of gp_segment_configuration have ``status = state``.

        The count is parsed from the ``-q -t`` output of
        ``PSQL.run_sql_command`` with ``int`` and logged before being
        returned.
        """
        query = "select count(*) from gp_segment_configuration where status = '%s'" % state
        raw_count = PSQL.run_sql_command(sql_cmd=query, flags='-q -t')
        segment_count = int(raw_count)
        tinctest.logger.info('Number of segments in %s State == %d' %
                             (state, segment_count))
        return segment_count

    def get_utilitymode_conn_info(self, relid=0):
        """Return ``[port, hostname]`` of one primary segment.

        Queries gp_segment_configuration for a single row with ``dbid > 1``
        and ``role = 'p'`` (``limit 1`` with no ordering, so which primary is
        returned is up to the server).

        relid: accepted for caller compatibility; the query does not use it.

        Returns a two-element list of strings with surrounding whitespace
        removed.  Raises IndexError when the query output does not contain
        two '|'-separated fields.
        """
        sql_cmd = "select port, hostname from gp_segment_configuration sc  where dbid > 1 and role = 'p' limit 1;"
        utilitymodeinfo = PSQL.run_sql_command(sql_cmd=sql_cmd, flags='-q -t')
        # Split once and strip every field: the columns come back padded
        # around the '|' separator (presumably psql's aligned output), which
        # previously left a trailing blank on the port and a leading blank on
        # the host.
        fields = [field.strip() for field in utilitymodeinfo.strip().split('|')]
        return [fields[0], fields[1]]

    def get_relid(self, file_name=None):
        """Look up the pg_class oid of the relation named ``file_name``.

        Returns the unparsed output of ``PSQL.run_sql_command`` run with
        ``-q -t``.
        """
        return PSQL.run_sql_command(
            sql_cmd="SELECT oid FROM pg_class WHERE relname='%s';\n" % file_name,
            flags='-q -t')

    def get_segment_cnt(self, relid=0, host=None, port=None):
        """Return a per-column segfile count greater than one, or ``'0'``.

        Runs, in utility mode against ``host``/``port``, a query over
        ``gp_toolkit.__gp_aocsseg(relid)`` that groups by ``column_num`` and
        keeps one group whose row count exceeds 1.  The raw query output is
        returned as-is; when it is blank the string ``'0'`` is returned
        instead.
        """
        query = ("select count(*) from gp_toolkit.__gp_aocsseg(%s) "
                 "group by column_num having count(*) > 1 limit 1") % relid
        output = PSQL.run_sql_command_utility_mode(
            sql_cmd=query, host=host, port=port, flags='-q -t')
        return output if output.strip() else '0'

    def run_test_utility_mode(self, filename):
        """Run and validate the ``filename`` SQL script in utility mode.

        The target segment's port and host are obtained from
        ``get_utilitymode_conn_info`` and forwarded to
        ``run_sql_utility_mode``.
        """
        # alter_aoco_tab_utilitymode
        conn_info = self.get_utilitymode_conn_info(
            relid=self.get_relid(file_name=filename))
        self.run_sql_utility_mode(
            filename, port=conn_info[0], host=conn_info[1])

    def run_sql_utility_mode(self, filename, host=None, port=None):
        """Run the ``filename`` SQL script in utility mode and validate its output.

        The script path comes from ``get_sql_files``; output is written to
        ``<base_dir>/sql/<filename>.out`` and compared against
        ``<base_dir>/expected/<filename>.ans`` with ``validate_sql``.
        ``host`` and ``port`` are forwarded unchanged to
        ``PSQL.run_sql_file_utility_mode``.
        """
        fname = filename
        sql_file = self.get_sql_files(fname)
        out_file = self.base_dir + "/sql/" + fname + '.out'
        ans_file = self.base_dir + "/expected/" + fname + '.ans'
        # Log the three paths involved, framed by separator lines.
        tinctest.logger.info('\n==============================')
        tinctest.logger.info(sql_file)
        tinctest.logger.info(out_file)
        tinctest.logger.info(ans_file)
        tinctest.logger.info('==============================')
        # NOTE(review): ``result`` is not used in the lines visible here.
        result = PSQL.run_sql_file_utility_mode(sql_file,
                                                out_file=out_file,
                                                host=host,
                                                port=port)
        self.validate_sql(ans_file, out_file)