Example #1
    def stop_start_validate(self, expect_down_segments=False):
        """
        Run gpstop -i followed by gpstart, and verify that all segments come back up.
        """
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running stop_start_validate")

        tinctest.logger.info("[STLRTest]Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i')
        if not expect_down_segments:
            if not ok:
                raise Exception(
                    '[STLRTest]Problem while shutting down the cluster')
            tinctest.logger.info(
                "[STLRTest]Successfully shutdown the cluster.")

        tinctest.logger.info("[STLRTest]Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()

        if not ok:
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        tinctest.logger.info("[STLRTest]Successfully restarted the cluster.")
        if not self.anydownsegments():
            raise Exception("[STLRTest]segments were marked down")
        else:
            return (True, "All segments are up")
Example #2
 def __init__(self, methodName):
     self.pgport = os.environ.get('PGPORT')
     self.fileutil = Filerepe2e_Util()
     self.gpstate = Gpstate()
     self.gpprimarymirror = Gpprimarymirror()
     self.gpstart = GpStart()
     self.gpstop = GpStop()
     super(FtsTransitions,self).__init__(methodName)
Example #3
    def setUp(self):
        self.basedir = os.path.dirname(__file__)
        self.gphome = os.environ.get('GPHOME')
        self.gp = GpStart()
        self.gps = GpStop()
        self.MAX_TRY = 3
        self.TIMEOUT = 90
        self.MAXPARALLELSEG = 60
Example #4
 def __init__(self, methodName):
     self.filereputil = Filerepe2e_Util()
     self.config = GPDBConfig()
     self.gprecover = GpRecover(self.config)
     self.gpstop = GpStop()
     self.gpstart = GpStart()
     self.gpverify = GpdbVerify(config=self.config)
     self.dbstate = DbStateClass('run_validation', self.config)
     self.port = os.getenv('PGPORT')
     super(PgtwoPhaseClass, self).__init__(methodName)
Example #5
 def __init__(self, methodName):
     self.pgport = os.environ.get('PGPORT')
     self.util = Filerepe2e_Util()
     self.gpconfig = GpConfig()
     self.config = GPDBConfig()
     self.gpr = GpRecover(self.config)
     self.dbstate = DbStateClass('run_validation', self.config)
     self.gpstart = GpStart()
     self.gpstop = GpStop()
     super(FilerepTestCase, self).__init__(methodName)
Example #6
    def start_db(self):
        """
        @summary: Start the greenplum DB based on the options provided 
        
        @param option: represents different gpstart command options
        @return: output of the executed command as a string
        """

        gpstart = GpStart()
        gpstart.run_gpstart_cmd()
Example #7
 def run_restart_database(self):
     '''
     @summary : Restart the database
     '''
     self.gpstart = GpStart()
     self.gpstop = GpStop()
     tinctest.logger.info("[STLRTest] Running run_restart_database")
     ok = self.gpstop.run_gpstop_cmd(immediate='i')
     tinctest.logger.info(ok)
     ok = self.gpstart.run_gpstart_cmd()
     tinctest.logger.info(ok)
Example #8
 def __init__(self,methodName):
     self.fileutil = Filerepe2e_Util()
     self.config = GPDBConfig()
     self.gprecover = GpRecover(self.config)
     self.gpstart = GpStart()
     self.gpstop = GpStop()
     self.gpfile = Gpfilespace(self.config)
     self.dbstate = DbStateClass('run_validation', self.config)
     self.port = os.getenv('PGPORT')
     self.base = GPDBStorageBaseTestCase()
     super(SuspendCheckpointCrashRecovery,self).__init__(methodName)
Example #9
    def __init__(self, config=None):
        if config is not None:
            self.config = config
        else:
            self.config = GPDBConfig()

        self.filereputil = Filerepe2e_Util()
        self.gprecover = GpRecover(self.config)
        self.gpstop = GpStop()
        self.gpstart = GpStart()
        self.gpverify = GpdbVerify(config=self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.port = os.getenv('PGPORT')
Example #10
 def run_restart_database(self):
     '''
     @summary : Restart the database
     '''
     self.gpstart = GpStart()
     self.gpstop = GpStop()
     tinctest.logger.info("[STLRTest] Running run_restart_database")
     ok = self.gpstop.run_gpstop_cmd(immediate='i')
     tinctest.logger.info(ok)
     ok = self.gpstart.run_gpstart_cmd()
     tinctest.logger.info(ok)
     tinctest.logger.info("[STLRTest] printing gp segment configuration")
     (gp_seg_conf) = PSQL.run_sql_command("select * from gp_segment_configuration order by dbid")
     tinctest.logger.info(gp_seg_conf)
Example #11
    def stop_start_validate(self, expect_down_segments=False):
        """
        Run gpstop -i followed by gpstart, and verify that all segments come back up.
        """
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running stop_start_validate")   

        tinctest.logger.info("[STLRTest]Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i')
        if not expect_down_segments:
            if not ok:
                raise Exception('[STLRTest]Problem while shutting down the cluster')
            tinctest.logger.info("[STLRTest]Successfully shutdown the cluster.")

        tinctest.logger.info("[STLRTest]Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()

        if not ok:
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        tinctest.logger.info("[STLRTest]Successfully restarted the cluster.")
        if not self.anydownsegments():
            raise Exception("[STLRTest]segments were marked down")
        else:
            return (True, "All segments are up")
Example #12
 def setUp(self):
     self.basedir = os.path.dirname(__file__)
     self.gphome = os.environ.get("GPHOME")
     self.gp = GpStart()
     self.gps = GpStop()
     self.MAX_TRY = 3
     self.TIMEOUT = 90
     self.MAXPARALLELSEG = 60
Example #13
 def __init__(self):
     self.gpstart = GpStart()
     self.gpstop = GpStop()
     self.config = GpConfig()
     self.port = os.getenv('PGPORT')
     self.gphome = os.getenv('GPHOME')
     self.base_dir = os.path.dirname(
         sys.modules[self.__class__.__module__].__file__)
Example #14
 def __init__(self, methodName):    
     self.pgport = os.environ.get('PGPORT')
     self.util = Filerepe2e_Util()
     self.gpconfig = GpConfig()
     self.config = GPDBConfig()
     self.gpr = GpRecover(self.config)
     self.dbstate = DbStateClass('run_validation',self.config)
     self.gpstart = GpStart()
     self.gpstop = GpStop()
     super(FilerepTestCase,self).__init__(methodName)
Example #15
 def __init__(self,methodName):
     self.filereputil = Filerepe2e_Util()
     self.config = GPDBConfig()
     self.gprecover = GpRecover(self.config)
     self.gpstop = GpStop()
     self.gpstart = GpStart()
     self.gpfile = Gpfilespace(self.config)
     self.gpverify = GpdbVerify(config=self.config)
     self.dbstate = DbStateClass('run_validation',self.config)
     self.port = os.getenv('PGPORT')
     super(PgtwoPhaseClass,self).__init__(methodName)
Example #16
 def run_restart_database(self):
     '''
     @summary : Restart the database
     '''
     self.gpstart = GpStart()
     self.gpstop = GpStop()
     tinctest.logger.info("[STLRTest] Running run_restart_database")
     ok = self.gpstop.run_gpstop_cmd(immediate='i')
     tinctest.logger.info(ok)
     ok = self.gpstart.run_gpstart_cmd()
     tinctest.logger.info(ok)
Example #17
    def __init__(self, config=None):
        if config is not None:
            self.config = config
        else:
            self.config = GPDBConfig()

        self.filereputil = Filerepe2e_Util()
        self.gprecover = GpRecover(self.config)
        self.gpstop = GpStop()
        self.gpstart = GpStart()
        self.gpfile = Gpfilespace(self.config)
        self.gpverify = GpdbVerify(config=self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.port = os.getenv('PGPORT')
Example #18
class FilerepTestCase(MPPTestCase):
    def __init__(self, methodName):
        self.pgport = os.environ.get('PGPORT')
        self.util = Filerepe2e_Util()
        self.gpconfig = GpConfig()
        self.config = GPDBConfig()
        self.gpr = GpRecover(self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        super(FilerepTestCase, self).__init__(methodName)

    def sleep(self, seconds=60):
        time.sleep(seconds)

    def create_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('create a file',
                      'touch %s' % file_path,
                      ctxt=REMOTE,
                      remoteHost=host)
        cmd.run(validateAfter=True)

    def remove_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('remove a file',
                      'rm %s' % file_path,
                      ctxt=REMOTE,
                      remoteHost=host)
        cmd.run(validateAfter=True)

    def get_timestamp_of_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
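        # Read the file's modification time on the remote segment host (the embedded python uses Python 2 print syntax)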
        cmd = Command(
            'check timestamp',
            """ python -c "import os; print os.stat('%s').st_mtime" """ %
            file_path,
            ctxt=REMOTE,
            remoteHost=host)
        cmd.run(validateAfter=True)
        res = cmd.get_results().stdout.strip()
        return res

    def verify_file_exists(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('check if file exists',
                      'test -f %s' % file_path,
                      ctxt=REMOTE,
                      remoteHost=host)
        cmd.run(validateAfter=True)

    def handle_ext_cases(self, file):
        """
        @file: writable external table (wet) sql file to update with this machine's environment
        """

        host = str(socket.gethostbyname(socket.gethostname()))  #Must be an IP
        querystring = "gpfdist://" + host + ":8088"

        if os.path.isfile(file):
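            # Rewrite any gpfdist://<host>:8088 location in the file, in place, to point at this host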
            for line in fileinput.FileInput(file, inplace=1):
                line = re.sub('gpfdist.+8088', querystring, line)
                print str(re.sub('\n', '', line))

    def handle_hybrid_part_cases(self, file):
        """
        @file: hybrid partition sql file to update with this machine's environment
        """

        querystring = "FROM '" + local_path('hybrid_part.data') + "'"
        if os.path.isfile(file):
            for line in fileinput.FileInput(file, inplace=1):
                line = re.sub('FROM\s\'.+hybrid_part.data\'', querystring,
                              line)
                print str(re.sub('\n', '', line))

    def preprocess(self):
        """ 
        Replace hard-coded information in the sql files with the correct hostname, IP address, etc.
        """

        list_workload_dir = [
            'set_sync1', 'sync1', 'set_ck_sync1', 'ck_sync1', 'set_ct', 'ct',
            'set_resync', 'resync', 'set_sync2', 'sync2'
        ]
        for dir in list_workload_dir:
            sql_path = os.path.join(local_path(dir), 'sql')
            ans_path = os.path.join(local_path(dir), 'expected')
            for file in os.listdir(sql_path):
                if (file.find('wet_ret') >= 0):
                    self.handle_ext_cases(os.path.join(sql_path, file))
                if (file.find('hybrid_part') >= 0):
                    self.handle_hybrid_part_cases(os.path.join(sql_path, file))
            for file in os.listdir(ans_path):
                if (file.find('wet_ret') >= 0):
                    self.handle_ext_cases(os.path.join(ans_path, file))
                if (file.find('hybrid_part') >= 0):
                    self.handle_hybrid_part_cases(os.path.join(ans_path, file))

    def clean_data(self):
        """ 
        Clean up the data files backing the external tables; otherwise, running multiple sql files
        keeps appending data to the same external table.
        """

        test = local_path("")
        test = str(test) + "data/*.*"

        cmd = 'rm -rfv ' + test
        run_shell_command(cmd)

    def anydownsegments(self):
        """
        Returns True if no segments are down, False otherwise
        """

        tinctest.logger.info("Checking if any segments are down")
        num_segments_down = self.count_of_nodes_down()
        if int(num_segments_down) == 0:
            return True
        else:
            return False

    def stop_start_validate(self, stopValidate=True):
        """
        Run gpstop -i followed by gpstart, and verify that all segments come back up.
        """

        tinctest.logger.info("Performing stop start validate")
        tinctest.logger.info("Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i', validate=stopValidate)
        if not ok and stopValidate:
            raise Exception('Problem while shutting down the cluster')
        tinctest.logger.info("Successfully shutdown the cluster.")

        tinctest.logger.info("Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failed to bring the cluster back up')
        tinctest.logger.info("Successfully restarted the cluster.")
        if not self.anydownsegments():
            raise Exception("segments were marked down")
        else:
            return (True, "All segments are up")

    def method_reset_fault_injection(self):
        """
        Resets fault injection
        Return: (True, [result]) if OK, or (False, [result]) otherwise
        """

        tinctest.logger.info("Resetting fault injection")

        (ok1, out1) = self.util.inject_fault(f='filerep_resync',
                                             m='async',
                                             y='reset',
                                             r='primary',
                                             H='ALL')
        if not ok1:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault  to reset resync")

        return (True, str(out1))

    def method_resume_filerep_resync(self):
        """
        Resumes the process of resync
        """

        tinctest.logger.info("Resuming Resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync',
                                           m='async',
                                           y='resume',
                                           r='primary',
                                           H='ALL')
        if not ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done resuming resync")
        return (ok, out)

    def run_method_suspendresync(self):
        """
        Stops the cluster from going to resync
        """

        tinctest.logger.info("Suspending resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync',
                                           m='async',
                                           y='suspend',
                                           r='primary',
                                           H='ALL')
        tinctest.logger.info('output from suspend resync %s' % out)
        if not ok:
            raise Exception("Fault injection failed")
        tinctest.logger.info("Done Injecting Fault to suspend resync")
        return (ok, out)

    def count_of_masters(self):
        """
        Gives the count of master nodes in the cluster
        Return: number of master nodes in the cluster
        """

        tinctest.logger.info("Count the number of masters")
        cmd = "select count(*) from gp_segment_configuration where content = -1"
        (out) = PSQL.run_sql_command(cmd)
        num_master = out.split('\n')[3].strip()
        return num_master

    def count_of_nodes(self):
        """
        Gives the count of nodes in the cluster
        Return: total number of nodes in the cluster
        """

        tinctest.logger.info("Counting number of nodes")
        cmd = "select count(*) from gp_segment_configuration"
        (num_cl) = PSQL.run_sql_command(cmd)
        total_num_rows = num_cl.split('\n')[3].strip()
        return total_num_rows

    def count_of_nodes_in_ct(self):
        """
        Gives the count of nodes in change tracking
        Return: number of nodes in change tracking
        """

        tinctest.logger.info("Counting number of nodes in ct")
        sqlcmd = "select count(*) from gp_segment_configuration where mode = 'c'"
        (num_cl) = PSQL.run_sql_command(sqlcmd)
        num_cl = num_cl.split('\n')[3].strip()
        return num_cl

    def count_of_nodes_down(self):
        """
        Gives the count of nodes marked as down
        Return: number of nodes marked as down
        """

        tinctest.logger.info("Counting the number of nodes down")
        sqlcmd = "select count(*) from gp_segment_configuration where status = 'd'"
        (num_down) = PSQL.run_sql_command(sqlcmd)
        num_down = num_down.split('\n')[3].strip()
        return num_down

    def count_of_nodes_sync(self):
        """
        Gives the count of nodes in sync
        Return: number of nodes in sync
        """

        tinctest.logger.info("Counting the number of nodes in sync")
        sqlcmd = "select count(*) from gp_segment_configuration where mode = 's'"
        (num_sync) = PSQL.run_sql_command(sqlcmd)
        num_sync = num_sync.split('\n')[3].strip()
        return num_sync

    def count_of_nodes_not_sync(self):
        """
        Gives the count of nodes not in sync
        Return: number of nodes not in sync
        """

        tinctest.logger.info("Counting number of nodes not in sync")
        sqlcmd = "select count(*) from gp_segment_configuration where mode <> 's'"
        (num_sync) = PSQL.run_sql_command(sqlcmd)
        num_sync = num_sync.split('\n')[3].strip()
        return num_sync

    def inject_fault_on_first_primary(self):
        """
	@product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        tinctest.logger.info("\n Injecting faults on first primary")
        (ok,
         out) = self.util.inject_fault(f='filerep_immediate_shutdown_request',
                                       m='async',
                                       y='infinite_loop',
                                       r='primary',
                                       seg_id=2,
                                       sleeptime=300)
        if not ok:
            raise Exception(
                "Fault filerep_immediate_shutdown_request injection failed")

        (ok, out) = self.util.inject_fault(f='fileRep_is_operation_completed',
                                           m='async',
                                           y='infinite_loop',
                                           r='primary',
                                           seg_id=2)
        if not ok:
            raise Exception(
                "Fault fileRep_is_operation_completed injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def inject_fault_on_first_mirror(self):
        """
	@product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        sqlcmd = "select dbid from gp_segment_configuration where content=0 and role='m'"
        (first_mirror_dbid) = PSQL.run_sql_command(sqlcmd)
        first_mirror_dbid = first_mirror_dbid.split('\n')[3].strip()

        tinctest.logger.info("\n Injecting faults on first mirror")
        flag = self.util.check_fault_status(
            fault_name='fileRep_is_operation_completed',
            status='triggered',
            max_cycle=100)
        if not flag:
            raise Exception(
                "Fault fileRep_is_operation_completed didn't trigger")

        (ok, out) = self.util.inject_fault(f='filerep_consumer',
                                           m='async',
                                           y='panic',
                                           r='mirror',
                                           seg_id=first_mirror_dbid)
        if not ok:
            raise Exception("Fault filerep_consumer injection failed")
        tinctest.logger.info("\n Done Injecting Fault")

    def setupGpfdist(self, port, path):
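        # Kill any gpfdist already running on this host, then start a new instance serving files from 'path'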
        gpfdist = Gpfdist(port, self.hostIP())
        gpfdist.killGpfdist()
        gpfdist.startGpfdist(' -t 30 -m 1048576 -d ' + path)
        return True

    def cleanupGpfdist(self, port, path):
        gpfdist = Gpfdist(port, self.hostIP())
        gpfdist.killGpfdist()
        return True

    def hostIP(self):
        ok = run_shell_command('which gpfdist')
        if not ok:
            raise GPtestError("Error:'which gpfdist' command failed.")
        hostname = socket.gethostname()
        if hostname.find('mdw') > 0:
            host = 'mdw'
        else:
            host = str(socket.gethostbyname(
                socket.gethostname()))  #Must be an IP
        tinctest.logger.info('current host is %s' % host)
        return host

    def method_setup(self):
        tinctest.logger.info("Performing setup tasks")
        gpfs = Gpfilespace()
        gpfs.create_filespace('filerep_fs_a')
        gpfs.create_filespace('filerep_fs_b')
        gpfs.create_filespace('filerep_fs_c')
        gpfs.create_filespace('filerep_fs_z')
        gpfs.create_filespace('sync1_fs_1')

        # Set max_resource_queues to 100
        cmd = 'gpconfig -c max_resource_queues -v 100 '
        ok = run_shell_command(cmd)
        if not ok:
            raise Exception(
                'Failure during setting the max_resource_queues value to 100 using gpconfig tool'
            )
        #Restart the cluster
        self.gpstop.run_gpstop_cmd(immediate='i')
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failure during restarting the cluster')
        return True

    def get_ext_table_query_from_gpstate(self):
        outfile = local_path("gpstate_tmp")
        ok = run_shell_command("gpstate --printSampleExternalTableSql >" +
                               outfile)
        querystring = ""
        flag = 'false'
        out = open(outfile, 'r').readlines()
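        # Keep everything from the 'DROP EXTERNAL TABLE ...' line onward; that block is the sample SQL printed by gpstate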
        for line in out:
            line.strip()
            if (line.find(
                    'DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status') >=
                    0):
                flag = 'true'
            if flag == 'true':
                querystring = querystring + line
        return querystring

    def check_gpstate(self, type, phase):
        """ 
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """

        if phase == 'sync1':
            state_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Synchronized' and status_in_config='Up' and instance_status='Up'"
            )
            sync1_num = self.query_select_count(
                "select count(*) from gp_segment_configuration where content <> -1"
            )
            if int(sync1_num) <> int(state_num):
                raise Exception("gpstate in Sync state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'ct':
            p_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Change Tracking'  and role = 'Primary' and status_in_config='Up' and instance_status='Up'"
            )
            m_num = self.query_select_count(
                "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Out of Sync'  and role = 'Mirror' and status_in_config='Down' and instance_status='Down in configuration' "
            )

            if int(p_num) <> int(m_num):
                raise Exception("gpstate in CT state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'resync_incr':

            if type == 'primary':
                query = "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Incremental'"
                resync_incr_num = self.query_select_count(query)
            else:
                query = "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
                resync_incr_num = self.query_select_count(query)

            query_num_rows = "select count(*) from gp_segment_configuration where content <> -1"
            num_rows = self.query_select_count(query_num_rows)

            if int(resync_incr_num) <> int(num_rows):
                tinctest.logger.info("resync_incr_num query run %s" % query)
                tinctest.logger.info("num_rows query run %s" % query_num_rows)
                raise Exception(
                    "gpstate in Resync Incremental  state failed. resync_incr_num %s <> num_rows %s"
                    % (resync_incr_num, num_rows))
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        elif phase == 'resync_full':
            num_rows = self.query_select_count(
                "select count(*) from gp_segment_configuration where content <> -1"
            )

            if type == 'primary':
                resync_full_num = self.query_select_count(
                    "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'"
                )
            else:
                resync_full_num = self.query_select_count(
                    "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'"
                )

            if int(resync_full_num) <> int(num_rows):
                raise Exception("gptate in Resync Full state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " % (phase))

        return True

    def trigger_transition(self):
        PSQL.run_sql_file(local_path('mirrors.sql'))

    def run_gpstate(self, type, phase):
        """
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """

        tinctest.logger.info("running gpstate")
        querystring = self.get_ext_table_query_from_gpstate()
        file1 = local_path('create_table_gpstate.sql')
        f1 = open(file1, 'w')
        f1.write(querystring)
        f1.write('\n')
        f1.close()
        PSQL.run_sql_file(local_path('create_table_gpstate.sql'))

        gpstate_outfile = local_path('gpstate_out')
        cmd = 'gpstate -s -a > %s 2>&1' % (gpstate_outfile)

        ok = run_shell_command(cmd)
        self.check_gpstate(type, phase)
        return ok

    def check_mirror_seg(self, master=False):
        tinctest.logger.info("running check mirror")
        self.dbstate.check_mirrorintegrity()

    def do_gpcheckcat(self,
                      dbname=None,
                      alldb=False,
                      online=False,
                      outputFile='checkcat.out',
                      outdir=None):
        tinctest.logger.info("running gpcheckcat")
        self.dbstate.check_catalog(outputFile=outputFile)

    def query_select_count(self, sqlcmd):
        (num) = PSQL.run_sql_command(sqlcmd)
        num = num.split('\n')[3].strip()
        return num

    def method_run_failover(self, type):
        """
        Inject fault to failover nodes
        @type: primary [induces fault in mirror] mirror [creates panic in primary]   
        Return: (True, [result of fault injection]) if OK, or (False, [result of fault injection]) otherwise
        """

        if type == 'primary':
            tinctest.logger.info("\n primary failover")
            (ok, out) = self.util.inject_fault(f='filerep_consumer',
                                               m='async',
                                               y='fault',
                                               r='mirror',
                                               H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")

        elif type == 'mirror':
            tinctest.logger.info("\n Mirror failover")
            (ok, out) = self.util.inject_fault(f='postmaster',
                                               m='async',
                                               y='panic',
                                               r='primary',
                                               H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")
        return True

    def wait_till_change_tracking_transition(self):
        self.util.wait_till_change_tracking_transition()

    def wait_till_insync_transition(self):
        self.gpr.wait_till_insync_transition()

    def run_gprecoverseg(self, recover_mode):
        if recover_mode == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

    def run_gpconfig(self, parameter, master_value, segment_value):
        if (parameter is not None):
            self.gpconfig.setParameter(parameter, master_value, segment_value)
            self.gpstop.run_gpstop_cmd(restart='r')

    def inject_fault(self,
                     fault=None,
                     mode=None,
                     operation=None,
                     prim_mirr=None,
                     host='All',
                     table=None,
                     database=None,
                     seg_id=None,
                     sleeptime=None,
                     occurence=None):
        if (fault is None or mode is None or operation is None
                or prim_mirr is None):
            raise Exception('Incorrect parameters provided for inject fault')

        (ok, out) = self.util.inject_fault(f=fault,
                                           m=mode,
                                           y=operation,
                                           r=prim_mirr,
                                           H='ALL',
                                           table=table,
                                           database=database,
                                           sleeptime=sleeptime,
                                           o=occurence,
                                           seg_id=seg_id)
Example #19
class gpStartTestCase(unittest.TestCase):
    def setUp(self):
        self.basedir = os.path.dirname(__file__)
        self.gphome = os.environ.get("GPHOME")
        self.gp = GpStart()
        self.gps = GpStop()
        self.MAX_TRY = 3
        self.TIMEOUT = 90
        self.MAXPARALLELSEG = 60

    def test_gpstart_logDir(self):
        tinctest.logger.info("Running test_gpstart_logDir")
        self.logdir = "".join([self.basedir, "/logs"])
        cmd = Command(name="Remove gpstop<nnnn>.log", cmdStr="rm -f %s/gpstop*" % (self.logdir))
        tinctest.logger.info("Removing gpstop<nnnn>.log : %s" % cmd)
        cmd.run(validateAfter=True)
        result = cmd.get_results()
        if result.rc != 0 or result.stderr:
            raise gpstopException("Not able to delete existing gpstop<nnnn>.log")
        lcmd = " ".join(["ls", self.logdir, "| wc -l"])
        res = False
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(logdir=self.logdir)
        if res is not True:
            raise GPstopError("Error : run_gpstart_cmd(logdir) failed \n")
        cmd = Command(name="count of  gpstart<nnnn>.log", cmdStr=" %s " % (lcmd))
        tinctest.logger.info("Count gpstart<nnnn>.log : %s" % cmd)
        cmd.run(validateAfter=True)
        result = cmd.get_results()
        if result.rc != 0 or result.stderr:
            raise gpstopException("Not able to get count of gpstart<nnnn>.log")
        assert int(result.stdout) > 0

    def test_gpstart_getversion(self):
        res = self.gp.get_version()
        self.assertTrue(res)

    def test_gpstart_restrict(self):
        tinctest.logger.info("Running test_gpstart_restrict")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(restrict="y")
        self.assertTrue(res)

    def test_gpstart_timeout(self):
        tinctest.logger.info("Running test_gpstart_timeout")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(timeout=self.TIMEOUT)
        self.assertTrue(res)

    def test_gpstart_parallelproc(self):
        tinctest.logger.info("Running test_gpstart_parallelproc")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(parallelproc=self.MAXPARALLELSEG)
        self.assertTrue(res)

    def test_gpstart_noprompt(self):
        tinctest.logger.info("Running test_gpstart_noprompt")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd()
        self.assertTrue(res)

    def test_gpstart_cmd_masterOnly(self):
        tinctest.logger.info("Running test_gpstart_cmd_masterOnly")
        if self.is_not_running_gpdb():
            self.gp.run_gpstart_cmd(masteronly="y")
        res = self.gpstartCheck()
        self.assertTrue(res)

    def test_gpstart_cmd_quiet(self):
        tinctest.logger.info("Running test_gpstart_cmd_quiet")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(quietmode="y")
        self.assertTrue(res)

    def test_gpstart_cmd_startcluster(self):
        tinctest.logger.info("Running test_gpstart_cmd_startcluster")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd()
        self.assertTrue(res)

    def test_gpstart_cmd_verbose(self):
        tinctest.logger.info("Running test_gpstart_cmd_verbose")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(verbose="y")
        self.assertTrue(res)

    def test_gpstart_check(self):
        if not self.gpstartCheck():
            res2 = self.gp.run_gpstart_cmd()
            res = self.gpstartCheck()
            self.assertTrue(res)

    def test_func_gpstart_quiet(self):
        if self.is_not_running_gpdb():
            res = self.gp.gpstart_quiet()
        self.assertTrue(res)

    def is_not_running_gpdb(self):
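        # Stop the cluster if it is currently running; return True once it is confirmed to be down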
        res = False
        ctr = 0
        while ctr < self.MAX_TRY:
            ctr = ctr + 1
            res = self.gpstartCheck()
            if res is True:
                self.gps.run_gpstop_cmd(quietmode="y")
            else:
                return True
        if res is True and ctr < self.MAX_TRY:
            return True
        else:
            return False

    def gpstartCheck(self):
        """
        	Checks if the cluster is brought up correctly and all segments are in sync
        	"""
        bashCmd = (
            "source "
            + (self.gphome)
            + "/greenplum_path.sh;"
            + (self.gphome)
            + "/bin/pg_ctl status -D $MASTER_DATA_DIRECTORY | grep 'pg_ctl: server is running'"
        )
        dbStart = Command(name="gpstartCheck ", cmdStr=bashCmd)
        dbStart.run()
        rc = dbStart.get_results().rc
        if rc != 0:
            return False
        return True
Example #20
 def restart(self):
     # GpStop does not accept immediate and restart both...
     GpStop().run_gpstop_cmd(immediate=True)
     GpStart().run_gpstart_cmd()
Example #21
class SubTransactionLimitRemovalTestCase(MPPTestCase):
    def __init__(self, methodName):
        super(SubTransactionLimitRemovalTestCase, self).__init__(methodName)

    def check_system(self):
        '''
        @summary: Check whether the system is up and in sync. Exit the test if not
        '''
        tinctest.logger.info("[STLRTest] Running check_system")

        tinctest.logger.info(
            "[STLRTest] Check whether the system is up and sync")

        cmd = "select count(*) from gp_segment_configuration where content<> -1 ;"
        (num_cl) = PSQL.run_sql_command(cmd)
        count_all = num_cl.split('\n')[3].strip()

        cmd = "select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';"
        (num_cl) = PSQL.run_sql_command(cmd)
        count_up_and_sync = num_cl.split('\n')[3].strip()
        tinctest.logger.info("[STLRTest] printing gp segment configuration")
        (gp_seg_conf) = PSQL.run_sql_command(
            "select * from gp_segment_configuration order by dbid")
        tinctest.logger.info(gp_seg_conf)

        if count_all != count_up_and_sync:
            raise Exception(
                "[STLRTest] System not in sync and up. Exiting test")
        else:
            tinctest.logger.info(
                "[STLRTest] Starting New Test: System is up and in sync...")

    def run_sqls(self, test):
        '''
        @summary : Run the sql 
        @param test: the sql file list
        '''
        tinctest.logger.info("[STLRTest] Running run_sqls")
        tinctest.logger.info("[STLRTest]Starting new thread to run sql %s" %
                             (test))
        PSQL.run_sql_file(local_path(test))

    def suspend_faults(self, fault_name):
        '''
        @summary : Suspend the specified fault: reset it before issuing suspend 
        @param fault_name : Name of the fault to suspend
        '''
        tinctest.logger.info("[STLRTest] Running suspend_faults")

        self.util = Filerepe2e_Util()

        (ok1, out1) = self.util.inject_fault(f=fault_name,
                                             m='async',
                                             y='reset',
                                             r='primary',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting the %s fault" %
                             (fault_name))

        (ok1, out1) = self.util.inject_fault(f=fault_name,
                                             m='async',
                                             y='suspend',
                                             r='primary',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done suspending the %s fault" %
                             (fault_name))

    def check_fault_status(self, fault_name=None, status=None, max_cycle=10):
        ''' 
        Check whether a fault has reached the given status, polling until it does
        @param fault_name : Fault name
        @param status : Status to be checked - triggered/completed
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running check_fault_status %s",
                             status)

        if (not fault_name) or (not status):
            raise Exception(
                "[STLRTest]Need a value for fault_name and status to continue")

        poll = 0
        while (poll < max_cycle):
            (ok1, out1) = self.util.inject_fault(f=fault_name,
                                                 m='async',
                                                 y='status',
                                                 r='primary',
                                                 H='ALL')
            poll += 1
            for line in out1.splitlines():
                if line.find(fault_name) > 0 and line.find(status) > 0:
                    tinctest.logger.info("[STLRTest]Fault %s is %s " %
                                         (fault_name, status))
                    poll = 0
                    tinctest.logger.info(
                        "[STLRTest] Running check_fault_status %s TRUE",
                        status)
                    return True

            #sleep a while before start polling again
            sleep(10)
        tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE",
                             status)
        return False

    def filerep_fault(self, trans_state):
        '''
        @summary : Inject the filerep fault supplied
        @param trans_state : type of transition 
        '''
        tinctest.logger.info("[STLRTest] Running filerep_fault")
        self.util = Filerepe2e_Util()

        if trans_state == 'failover_to_primary':
            tinctest.logger.info("[STLRTest] primary failover")
            (ok1, out1) = self.util.inject_fault(f='filerep_consumer',
                                                 m='async',
                                                 y='fault',
                                                 r='mirror',
                                                 H='ALL')

            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done primary failover fault")

        elif trans_state == 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for postmaster panic")
            (ok1, out1) = self.util.inject_fault(f='postmaster',
                                                 m='async',
                                                 y='panic',
                                                 r='primary',
                                                 H='ALL')

            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done postmaster panic fault")

        elif trans_state == 'postmaster_reset':
            tinctest.logger.info("[STLRTest] fault for filerep_sender panic")
            (ok1, out1) = self.util.inject_fault(f='filerep_sender',
                                                 m='async',
                                                 y='panic',
                                                 r='primary',
                                                 H='ALL')

            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done filerep_sender panic fault")

        tinctest.logger.info("[STLRTest] Done Injecting Fault")

    def resume_faults(self, fault_name, trans_state):
        '''
        @summary : Resume the fault and check status
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_faults")

        if not trans_state == 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for %s resume" % fault_name)
            (ok1, out1) = self.util.inject_fault(f=fault_name,
                                                 m='async',
                                                 y='resume',
                                                 r='primary',
                                                 H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault resume failed")
            tinctest.logger.info("[STLRTest]Done fault for %s resume" %
                                 fault_name)

        if trans_state == 'postmaster_reset':
            (ok1, out1) = self.util.inject_fault(f=fault_name,
                                                 m='async',
                                                 y='resume',
                                                 r='mirror',
                                                 H='ALL')
            if not ok1:
                tinctest.logger.info(
                    "[STLRTest]Failed fault for %s resume on mirror" %
                    fault_name)

        if trans_state == 'failover_to_primary':
            self.check_fault_status(fault_name, 'completed')

    def checkPSQLRun(self, test):
        '''Check if the psql run started in parallel is over before running the _post.sql '''
        tinctest.logger.info("[STLRTest] Running checkPSQLRun")
        cmd_str = 'ps -ef|grep sub_transaction_limit_removal|grep psql'
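        # Poll every 10 seconds until no psql process for this test is still running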
        while (1):
            is_running = 0
            (rc, out) = shell.run(cmd_str)
            for line in out:
                if '%s' % test in line:
                    is_running = 1
            if is_running == 0:
                return True
            else:
                sleep(10)
        return False

    def resume_filerep_resync(self):
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_filerep_resync")

        tinctest.logger.info("[STLRTest] fault for failover_to_mirror resume")
        (ok1, out1) = self.util.inject_fault(f='filerep_resync',
                                             m='async',
                                             y='resume',
                                             r='primary',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info(
            "[STLRTest]Done fault for failover_to_mirror resume")
        sleep(10)

    def stop_start_validate(self, expect_down_segments=False):
        """
        Run gpstop -i followed by gpstart, and verify that all segments come back up.
        """
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running stop_start_validate")

        tinctest.logger.info("[STLRTest]Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate='i')
        if not expect_down_segments:
            if not ok:
                raise Exception(
                    '[STLRTest]Problem while shutting down the cluster')
            tinctest.logger.info(
                "[STLRTest]Successfully shutdown the cluster.")

        tinctest.logger.info("[STLRTest]Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()

        if not ok:
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        tinctest.logger.info("[STLRTest]Successfully restarted the cluster.")
        if not self.anydownsegments():
            raise Exception("[STLRTest]segments were marked down")
        else:
            return (True, "All segments are up")

    def run_gprecoverseg(self, recover_option):
        '''
        @summary : Call gprecoverseg full or incremental to bring the cluster back in sync
        '''
        self.gpr = GpRecover()

        tinctest.logger.info("[STLRTest] Running run_gprecoverseg")

        if recover_option == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

        self.gpr.wait_till_insync_transition()

    def run_restart_database(self):
        '''
        @summary : Restart the database
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running run_restart_database")
        ok = self.gpstop.run_gpstop_cmd(immediate='i')
        tinctest.logger.info(ok)
        ok = self.gpstart.run_gpstart_cmd()
        tinctest.logger.info(ok)

    def reset_faults(self, fault_name, current_cluster_state):
        '''
        @summary : Reset the faults at the end of test 
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running reset_faults")

        tinctest.logger.info("[STLRTest] Resetting fault before ending test")

        (ok1, out1) = self.util.inject_fault(f=fault_name,
                                             m='async',
                                             y='reset',
                                             r='primary',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting %s fault" %
                             (fault_name))

        if current_cluster_state == 'resync':
            (ok1, out1) = self.util.inject_fault(f='filerep_resync',
                                                 m='async',
                                                 y='reset',
                                                 r='primary',
                                                 H='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")
            tinctest.logger.info("[STLRTest]Done filerep_resync fault")

        (ok1, out1) = self.util.inject_fault(f='checkpoint',
                                             m='async',
                                             y='reset',
                                             r='primary',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting checkpoint fault")

    def do_gpcheckcat(self,
                      dbname=None,
                      alldb=False,
                      online=False,
                      outputFile='checkcat.out',
                      outdir=None):
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
        self.dbstate.check_catalog()
        return True

    def _validation(self):
        '''
        @summary : Run gpcheckcat and check mirror integrity
        '''

        ###psql.run_shell_command("CHECKPOINT; CHECKPOINT; CHECKPOINT;CHECKPOINT; CHECKPOINT;")
        ###sleep(30) # sleep for some time for the segments to be in sync before validation

        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running _validation")

        outfile = local_path("subt_checkcat.out")
        self.dbstate.check_catalog(outputFile=outfile)

        self.dbstate.check_mirrorintegrity()

    def inject_and_resume_fault(self, fault_name, trans_state):
        self.check_fault_status(fault_name, 'triggered')
        self.filerep_fault(trans_state)
        if trans_state == 'failover_to_mirror':
            PSQL.run_sql_file(local_path('test_while_ct.sql'))
        self.resume_faults(fault_name, trans_state)

    def run_post_sqls(self, fault_name='', trans_state=''):
        PSQL.wait_for_database_up()
        if (trans_state == 'failover_to_primary' or trans_state == ''):
            post_sql = "failover_sql/subt_create_table_ao_post_commit"
        else:
            post_sql = "failover_sql/subt_create_table_ao_post_abort"

        sql_file = post_sql + ".sql"
        ans_file = post_sql + ".ans"
        out_file = post_sql + ".out"

        PSQL.run_sql_file(sql_file=local_path(sql_file),
                          out_file=local_path(out_file))
        diff_res = Gpdiff.are_files_equal(local_path(out_file),
                                          local_path(ans_file))

        if not diff_res:
            self.fail("[STLRTest]Gpdiff failed for : %s %s" %
                      (fault_name, trans_state))

    def reset_all_faults(self):
        '''
        @summary : Reset all faults on primary and mirror 
        '''
        tinctest.logger.info("[STLRTest] Running reset_all_faults")
        self.util = Filerepe2e_Util()

        (ok1, out1) = self.util.inject_fault(f='all',
                                             m='async',
                                             y='reset',
                                             r='primary',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting all faults on primary")

        (ok1, out1) = self.util.inject_fault(f='all',
                                             m='async',
                                             y='reset',
                                             r='mirror',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info(
            "[STLRTest]Done resetting all faults fault on mirror")

    def kill_zombies(self):
        ''' 
        @summary : There are stray zombie processes running after each test. This method clears them 
        '''
        tinctest.logger.info("[STLRTest] Running kill_zombies")
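        # List processes whose parent pid is 1 (orphaned), formatted as 'ppid#pid' pairs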
        cmd_str = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1"
        cmd = Command("shell_command", cmd_str)
        tinctest.logger.info('Executing command: %s : %s' %
                             ("shell command", cmd_str))
        cmd.run()
        result = cmd.get_results()
        out = result.stdout
        lines = out.split('\n')
        for line in lines:
            pids = line.split('#')
            if pids[0] == '1':
                kill_str = "kill -9 %s" % (pids[1])
                cmd2 = Command("kill_command", kill_str)
                cmd2.run()

    def skip_checkpoint(self):
        ''' 
        @summary : Routine to inject fault that skips checkpointing 
        '''

        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running skip_checkpoint")

        (ok1, out1) = self.util.inject_fault(f='checkpoint',
                                             m='async',
                                             y='reset',
                                             r='primary',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting the checkpoint fault")

        (ok1, out1) = self.util.inject_fault(f='checkpoint',
                                             m='async',
                                             y='skip',
                                             r='primary',
                                             H='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done skipping the checkpoint fault")

    def method_setup(self):
        tinctest.logger.info("Performing setup tasks")
        gpfs = Gpfilespace()
        gpfs.create_filespace('subt_filespace_a')

    def cleandb(self):
        db = Database()
        db.setupDatabase('gptest')
Example #22
class gpStopTestCase(unittest.TestCase):
    def setUp(self):
        self.basedir = os.path.dirname(__file__)
        self.gphome = os.environ.get('GPHOME')
        self.gp = GpStart()
        self.gps = GpStop()
        self.MAX_TRY = 3
        self.TIMEOUT = 605
        self.MAXPARALLELSEG = 60

    def test_gpstop_immediate(self):
        tinctest.logger.info("Running test_gpstop_immediate")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(immediate='y')
        self.assertTrue(res)

    def test_gpstop_getversion(self):
        tinctest.logger.info("Running test_gpstop_getversion")
        res = self.gps.run_gpstop_cmd(version='y')
        self.assertTrue(res)

    def test_gpstop_quiet(self):
        tinctest.logger.info("Running test_gpstop_quiet")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(quietmode='y')
        self.assertTrue(res)

    def test_gpstop_verbose(self):
        tinctest.logger.info("Running test_gpstop_verbose")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(verbose='y')
        self.assertTrue(res)

    def test_gpstop_fast(self):
        tinctest.logger.info("Running test_gpstop_fast")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(fast='y')
        self.assertTrue(res)

    def test_gpstop_smart(self):
        tinctest.logger.info("Running test_gpstop_smart")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(smart='y')
        self.assertTrue(res)

    def test_gpStop_masterOnly(self):
        tinctest.logger.info("Running test_gpstop_masteronly")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(masteronly='y')
        self.assertTrue(res)

    def test_gpstop_restart(self):
        tinctest.logger.info("Running test_gpstop_restart")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(restart='y')
        self.assertTrue(res)

    def test_gpstop_reload(self):
        tinctest.logger.info("Running test_gpstop_reload")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(reload='y')
        self.assertTrue(res)

    def test_gpstop_timeout(self):
        tinctest.logger.info("Running test_gpstop_timeout")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(timeout=self.TIMEOUT)
        self.assertTrue(res)

    def test_gpstop_parallelproc(self):
        tinctest.logger.info("Running test_gpstop_parallelproc")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(parallelproc=self.MAXPARALLELSEG)
        self.assertTrue(res)

    def test_gpstop_notstandby(self):
        tinctest.logger.info("Running test_gpstop_notstandby")
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(notstandby='y')
        self.assertTrue(res)

    def test_gpstop_logDir(self):
        tinctest.logger.info("Running test_gpstop_logDir")
        self.logdir = ''.join([self.basedir, '/logs'])
        # Remove any pre-existing gpstop<nnnn>.log files before running gpstop
        cmd = Command(name='Remove gpstop<nnnn>.log', cmdStr='rm -f %s/gpstop*' % (self.logdir))
        tinctest.logger.info("Removing gpstop<nnnn>.log : %s" % cmd)
        cmd.run(validateAfter=True)
        result = cmd.get_results()
        if result.rc != 0 or result.stderr:
            raise gpstopException("Not able to delete existing gpstop<nnnn>.log")
        lcmd = ' '.join(['ls', self.logdir, '| wc -l'])
        res = False
        if self.is_gpdb_running():
            res = self.gps.run_gpstop_cmd(logdir=self.logdir)
        if res is not True:
            raise GPstopError("Error : gpstop_logDir() failed \n")
        cmd = Command(name='count of gpstop<nnnn>.log', cmdStr=' %s ' % (lcmd))
        tinctest.logger.info("Count gpstop<nnnn>.log : %s" % cmd)
        cmd.run(validateAfter=True)
        result = cmd.get_results()
        if result.rc != 0 or result.stderr:
            raise gpstopException("Not able to get count of gpstop<nnnn>.log")
        assert int(result.stdout) > 0

    def is_gpdb_running(self):
        # Try up to MAX_TRY times to find the cluster up, starting it if it is down
        res = False
        ctr = 0
        while ctr < self.MAX_TRY:
            ctr = ctr + 1
            res = self.gpstartCheck()
            if res is False:
                res = self.gp.run_gpstart_cmd(quietmode='y')
            else:
                break
        if res is True and ctr < self.MAX_TRY:
            return True
        else:
            return False

    def gpstartCheck(self):
        """
        Checks if the cluster is brought up correctly and all segments are in sync
        """
        bashCmd = 'source ' + self.gphome + '/greenplum_path.sh;' + self.gphome + '/bin/pg_ctl status -D $MASTER_DATA_DIRECTORY | grep \'pg_ctl: server is running\''
        dbStart = Command(name='gpstartCheck ', cmdStr=bashCmd)
        dbStart.run()
        rc = dbStart.get_results().rc
        if rc != 0:
            return False
        return True
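
These gpstop tests follow the plain unittest pattern, so outside the TINC runner they could be driven with a standard loader; a minimal sketch (the standalone runner below is an assumption, TINC normally discovers and runs the tests itself):

if __name__ == '__main__':
    # Hypothetical standalone runner for the test case above
    suite = unittest.TestLoader().loadTestsFromTestCase(gpStopTestCase)
    unittest.TextTestRunner(verbosity=2).run(suite)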
Example #23
0
class PgtwoPhaseClass(MPPTestCase):
    '''Helper class for pg_twophase supporting functions '''

    def __init__(self,methodName):
        self.filereputil = Filerepe2e_Util()
        self.config = GPDBConfig()
        self.gprecover = GpRecover(self.config)
        self.gpstop = GpStop()
        self.gpstart = GpStart()
        self.gpfile = Gpfilespace(self.config)
        self.gpverify = GpdbVerify(config=self.config)
        self.dbstate = DbStateClass('run_validation',self.config)
        self.port = os.getenv('PGPORT')
        super(PgtwoPhaseClass,self).__init__(methodName)

    def invoke_fault(self, fault_name, type, role='mirror', port=None, occurence=None, sleeptime=None, seg_id=None):
        ''' Reset the fault and then issue the fault with the given type'''
        self.filereputil.inject_fault(f=fault_name, y='reset', r=role, p=port , o=occurence, sleeptime=sleeptime, seg_id=seg_id)
        self.filereputil.inject_fault(f=fault_name, y=type, r=role, p=port , o=occurence, sleeptime=sleeptime, seg_id=seg_id)
        tinctest.logger.info('Successfully injected fault_name : %s fault_type : %s' % (fault_name, type))

    def inject_fault(self, fault_type):
        '''
        @param fault_type : type of fault to be suspended
        '''
        if fault_type == 'end_prepare_two_phase_sleep':
            self.filereputil.inject_fault(f='end_prepare_two_phase_sleep', sleeptime='1000', y='sleep', r='primary', p=self.port)
            tinctest.logger.info('Injected fault to sleep in end_prepare_two_phase')

        elif fault_type == 'abort':
            # For the abort case we also inject this error-type fault, to fake a situation where one of the segments does not respond, which makes the master trigger an abort of the transaction
            self.invoke_fault('transaction_abort_after_distributed_prepared', 'error', port=self.port, occurence='0', seg_id='1')

            self.invoke_fault('twophase_transaction_abort_prepared', 'suspend', role='primary', port=self.port, occurence='0')

        elif fault_type == 'commit':
            self.invoke_fault('twophase_transaction_commit_prepared', 'suspend', role='primary', port=self.port, occurence='0')

        elif fault_type == 'dtm_broadcast_prepare':
            self.invoke_fault('dtm_broadcast_prepare', 'suspend', seg_id = '1', port=self.port, occurence='0')

        elif fault_type == 'dtm_broadcast_commit_prepared':
            self.invoke_fault('dtm_broadcast_commit_prepared', 'suspend', seg_id = '1', port=self.port, occurence='0')

        elif fault_type == 'dtm_xlog_distributed_commit':
            self.invoke_fault('dtm_xlog_distributed_commit', 'suspend', seg_id = '1', port=self.port, occurence='0')

    def resume_faults(self, fault_type, cluster_state='sync'):
        '''
        @param fault_type : commit/abort/end_prepare_two_phase_sleep/dtm_broadcast_prepare/dtm_broadcast_commit_prepared/dtm_xlog_distributed_commit
        @description : Resume the suspended faults 
        '''
        tinctest.logger.info('Resuming faults for fault_type %s' % fault_type)
        if fault_type == 'abort':
            self.filereputil.inject_fault(f='twophase_transaction_abort_prepared', y='resume', r='primary', p=self.port , o='0')
            if cluster_state !='resync':
                self.filereputil.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', p=self.port , o='0', seg_id='1')
        elif fault_type == 'commit':
            self.filereputil.inject_fault(f='twophase_transaction_commit_prepared', y='resume', r='primary', p=self.port , o='0')

        elif fault_type == 'dtm_broadcast_prepare':
            self.filereputil.inject_fault(f='dtm_broadcast_prepare', y='resume', seg_id = '1', p=self.port, o='0')

        elif fault_type == 'dtm_broadcast_commit_prepared':
            tinctest.logger.info('coming to if dtm_broadcast_commit_prepared')
            self.filereputil.inject_fault(f='dtm_broadcast_commit_prepared', y='resume', seg_id = '1', p=self.port, o='0')

        elif fault_type == 'dtm_xlog_distributed_commit':
            self.filereputil.inject_fault(f='dtm_xlog_distributed_commit', y='resume', seg_id = '1', p=self.port, o='0')

        else:
            tinctest.logger.info('No faults to resume')
        tinctest.logger.info('Resumed the suspended transaction fault')
        
        #Wait till all the trigger_sqls are complete before returning
        sql_count = PSQL.run_sql_command('select count(*) from pg_stat_activity;', flags ='-q -t', dbname='postgres')
        while(sql_count.strip() != '1'):
            sleep(5)
            sql_count = PSQL.run_sql_command('select count(*) from pg_stat_activity;', flags ='-q -t', dbname='postgres')
            tinctest.logger.info('stat_activity count %s ' % sql_count)
        return

    def start_db(self):
        '''Gpstart '''
        rc = self.gpstart.run_gpstart_cmd()
        if not rc:
            raise Exception('Failed to start the cluster')
        tinctest.logger.info('Started the cluster successfully')

    def stop_db(self):
        ''' Gpstop; do not check the return code '''
        cmd = Command('Gpstop_a', 'gpstop -a')
        tinctest.logger.info('Executing command: gpstop -a')
        cmd.run()

    def crash_and_recover(self, crash_type, fault_type, checkpoint='noskip', cluster_state='sync'):
        '''
        @param crash_type : gpstop_i/gpstop_a/failover_to_primary/failover_to_mirror
        @note: when skip checkpoint is enabled, gpstop -a returns a non-zero return code and the library treats it as a failure. As a workaround, we use the local stop_db function
        '''
        if crash_type == 'gpstop_i' :
            rc = self.gpstop.run_gpstop_cmd(immediate = True)
            if not rc:
                raise Exception('Failed to stop the cluster')
            tinctest.logger.info('Stopped cluster immediately')
            self.start_db()
        elif crash_type == 'gpstop_a':
            self.resume_faults(fault_type, cluster_state)
            if checkpoint == 'skip' :
                self.stop_db()
            else:
                rc = self.gpstop.run_gpstop_cmd()
                if not rc:
                    raise Exception('Failed to stop the cluster')
            tinctest.logger.info('Smart stop completed')
            self.start_db()                            
        elif crash_type == 'failover_to_primary':
            self.invoke_fault('filerep_consumer', 'fault')
            self.resume_faults(fault_type, cluster_state)
            (rc, num) =self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Value of rc and num_down %s, %s, %s' % (rc, num, fault_type))

        elif crash_type == 'failover_to_mirror':
            self.invoke_fault('postmaster', 'panic', role='primary')
            if fault_type in ('dtm_broadcast_prepare', 'dtm_broadcast_commit_prepared', 'dtm_xlog_distributed_commit') :
                self.resume_faults(fault_type, cluster_state)
            (rc, num) = self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Value of rc and num_down %s, %s' % (rc, num))
            if fault_type == 'abort' :
                self.filereputil.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset',p=self.port , o='0', seg_id='1')

        if cluster_state == 'resync':
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')                        

    def get_trigger_status_old(self, trigger_count):
        '''Compare the pg_stat_activity count with the total number of trigger_sqls executed '''
        for i in range(1,50):
            psql_count = PSQL.run_sql_command('select count(*) from pg_stat_activity;', flags='-q -t', dbname='postgres')
        tinctest.logger.info('Count of trigger sqls %s' % psql_count)
        if int(psql_count.strip()) < trigger_count :
            tinctest.logger.info('Fewer active sessions than trigger sqls; not all are waiting yet')
            return False
        return True

    def get_trigger_status(self, trigger_count, fault_type):
        if fault_type is None:
            return self.get_trigger_status_old(trigger_count)

        return self.filereputil.check_fault_status(fault_name=fault_type, status="triggered", seg_id='1', num_times_hit=trigger_count)

    def check_trigger_sql_hang(self, test_dir, fault_type = None):
        '''
        @description : Return the status of the trigger sqls, i.e. whether they are waiting on the fault.
        Since gpfaultinjector has no way to check whether all the sqls have been triggered, we use
        a count(*) on pg_stat_activity and compare it with the total number of trigger_sqls.
        '''
        trigger_count=0
        for dir in test_dir.split(","):
            trigger_dir = local_path('%s/trigger_sql/sql/' % (dir))
            trigger_count += len(glob.glob1(trigger_dir,"*.sql"))
        tinctest.logger.info('Total number of sqls to trigger %d in %s' % (trigger_count,test_dir));
        return self.get_trigger_status(trigger_count, fault_type)


    def run_faults_before_pre(self, cluster_state):
        '''
        @param cluster_state : sync/change_tracking/resync
        @description: 1. Put the cluster into change_tracking in case of resync/change_tracking.
        '''
        if cluster_state == 'resync':
            self.invoke_fault('filerep_consumer', 'fault')
            self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Change_tracking transition complete')

    def run_faults_before_trigger(self, checkpoint, cluster_state, fault_type):
        '''
        @param checkpoint : skip/noskip
        @param cluster_state : sync/change_tracking/resync
        @param fault_type : commit/abort
        @param end_prepare_two_phase_sleep : True/False
        @description : 1. Suspend the resync faults. 2. Issue a CHECKPOINT before skipping checkpoints, so that the buffer pool is flushed. 3. If checkpoint is 'skip', inject the skip-checkpoint fault. 4. Suspend the transaction faults based on the test type.
        '''
        if cluster_state == 'change_tracking':
            self.invoke_fault('filerep_consumer', 'fault')
            self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Change_tracking transition complete')

        if cluster_state == 'resync':
            self.invoke_fault('filerep_resync', 'suspend', role='primary')

            if checkpoint == 'skip':
                self.invoke_fault('filerep_transition_to_sync_before_checkpoint', 'suspend', role='primary', port=self.port, occurence='0')
            rc = self.gprecover.incremental()
            if not rc:
                raise Exception('Gprecoverseg failed')
            tinctest.logger.info('Cluster in resync state')

        PSQL.run_sql_command('CHECKPOINT;', dbname='postgres')
        if checkpoint == 'skip':
            self.invoke_fault('checkpoint', 'skip', role='primary', port= self.port, occurence='0')
        self.inject_fault(fault_type)

        if cluster_state == 'resync':
            self.filereputil.inject_fault(f='filerep_resync', y='resume', r='primary')

        PSQL.wait_for_database_up();

    def run_crash_and_recover(self, crash_type, fault_type, test_dir, cluster_state='sync', checkpoint='noskip'):
        '''
        @param crash_type : gpstop_i/gpstop_a/failover_to_mirror/failover_to_primary
        @param fault_type : commit/abort/end_prepare_two_phase_sleep
        @param test_dir : dir of the trigger sqls
        @description : Execute the specified crash type before/after resuming the suspended fault and recover
        '''
        trigger_status = self.check_trigger_sql_hang(test_dir)
        tinctest.logger.info('trigger_status %s' % trigger_status)
        sleep(50) # This sleep is needed till we get a way to find the state of all suspended sqls
        if trigger_status == True:
            if cluster_state == 'resync':
                self.filereputil.inject_fault(f='filerep_transition_to_sync_before_checkpoint', y='resume', r='primary')
                sleep(15) # wait little before crash
            self.crash_and_recover(crash_type, fault_type, checkpoint, cluster_state)
        else:
            tinctest.logger.info('The fault_status is not triggered')
    
    def gprecover_rebalance(self):
        '''
        @description: Run gprecoverseg -r. If the rc is not 0, rerun an incremental gprecoverseg -a; this is a workaround for known open issues.
        '''
        cmd = Command(name='Run gprecoverseg', cmdStr='gprecoverseg -r -a')
        tinctest.logger.info('Running %s' % cmd.cmdStr)
        cmd.run(validateAfter=False)
        result = cmd.get_results()
        if result.rc != 0:
            rc = self.gprecover.incremental()
            if rc:
                return True
        else:
            return True
        return False

    def run_gprecover(self, crash_type, cluster_state='sync'):
        '''Recover the cluster if required. '''
        if crash_type in ('failover_to_primary', 'failover_to_mirror') or cluster_state == 'change_tracking' :
            rc = self.gprecover.incremental()
            if not rc:
                raise Exception('Gprecoverseg failed')
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')                        
            tinctest.logger.info('Cluster in sync state')
            if crash_type == 'failover_to_mirror' :
                #rc = self.gprecover.rebalance()
                # -r has issues occasionally, may need another gprecoverseg, so using a local function
                rc = self.gprecover_rebalance()
                if not rc:
                    raise Exception('Rebalance failed')
                if not self.gprecover.wait_till_insync_transition():
                    raise Exception('Segments not in sync')                        
                tinctest.logger.info('Successfully Rebalanced the cluster')
    
        else:
            tinctest.logger.info('No need to run gprecoverseg. The cluster should be already in sync')


    def switch_ckpt_faults_before_trigger(self, cluster_state, fault_type):
        '''
        @param cluster_state : sync/change_tracking/resync
        @param fault_type : dtm_broadcast_prepare/dtm_broadcast_commit_prepared/dtm_xlog_distributed_commit
        '''
        if cluster_state in ('change_tracking', 'resync'):
            self.invoke_fault('filerep_consumer', 'fault')
            self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Change_tracking transition complete') 

        if cluster_state == 'resync':
            self.invoke_fault('filerep_resync', 'suspend', role='primary')
            rc = self.gprecover.incremental()
            if not rc:
                raise Exception('Gprecoverseg failed')
            tinctest.logger.info('Cluster in resync state')
        self.inject_fault(fault_type)

    def switch_ckpt_switch_xlog(self):
        '''
        @description: pg_switch_xlog on segments
        '''
        sql_cmd = 'select * from pg_switch_xlog();'
        num_primary = self.config.get_countprimarysegments()
        for i in range(num_primary):
            (host, port) = self.config.get_hostandport_of_segment(psegmentNumber=i)
            PSQL.run_sql_command_utility_mode(sql_cmd, host = host, port = port)

    def switch_checkpoint_loop(self, fault_type):
        '''     
        @description: Run switch_xlog and checkpoint based on the fault_type
        '''     
        if fault_type == 'dtm_xlog_distributed_commit':
            self.switch_ckpt_switch_xlog()
        else:
            for i in range(5):
                self.switch_ckpt_switch_xlog()

    def switch_ckpt_crash_and_recover(self, crash_type, fault_type, test_dir, cluster_state='sync', checkpoint='noskip'):
        '''
        @param crash_type : gpstop_i/gpstop_a/failover_to_mirror/failover_to_primary
        @param fault_type : dtm_broadcast_prepare/dtm_broadcast_commit_prepared/dtm_xlog_distributed_commit
        @param test_dir : dir of the trigger_sqls
        '''
        trigger_status = self.check_trigger_sql_hang(test_dir, fault_type)
        tinctest.logger.info('trigger_status %s' % trigger_status)

        if trigger_status == True:
            if cluster_state == 'resync':
                self.filereputil.inject_fault(f='filerep_resync', y='resume', r='primary')
                sleep(30) #Give a little time before crash.
            self.crash_and_recover(crash_type, fault_type, checkpoint, cluster_state)
        else:
            tinctest.logger.info('The fault_status is not triggered')
    
   
    def cleanup_dangling_processes(self):
        '''
        @description: Since the tests suspend transactions at different stages and then do an immediate shutdown,
        a few processes will not be cleaned up and will eventually eat up system resources.
        This method takes care of killing them at the end of each test, if such processes exist.
        '''

        num_primary = self.config.get_countprimarysegments()
        for i in range(num_primary):
            (host, port) = self.config.get_hostandport_of_segment(psegmentNumber=i)
            grep_cmd = "ps -ef|grep %s|grep 'Distributed'" % port
            cmd = Command('Check for dangling process', cmdStr = 'gpssh -h %s -e "%s" ' % (host, grep_cmd))
            cmd.run()
            result = cmd.get_results()
            if len(result.stdout.splitlines()) > 2 :
                grep_and_kill_cmd = "ps -ef|grep %s|grep 'Distributed'|awk '{print \$2}'|xargs kill -9" % port
                cmd = Command('Kill dangling processes', cmdStr='gpssh -h %s -e "%s" ' % (host, grep_and_kill_cmd ))
                cmd.run()
                tinctest.logger.info('Killing the dangling processes') 
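
A minimal sketch of how the PgtwoPhaseClass helpers above compose into one crash-and-recover scenario, assuming the class is instantiated as helper; the trigger sqls that the real tests run in the background are omitted, and the driver function is illustrative only:

def example_pg_twophase_scenario(helper, test_dir):
    cluster_state = 'sync'
    fault_type = 'commit'
    checkpoint = 'noskip'
    helper.run_faults_before_pre(cluster_state)
    helper.run_faults_before_trigger(checkpoint, cluster_state, fault_type)
    # ... start the trigger sqls under test_dir/trigger_sql/sql in the background ...
    helper.run_crash_and_recover('gpstop_i', fault_type, test_dir, cluster_state, checkpoint)
    helper.run_gprecover('gpstop_i', cluster_state)
    helper.cleanup_dangling_processes()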
Example #24
0
class SuspendCheckpointCrashRecovery(MPPTestCase):
    
    def __init__(self,methodName):
        self.fileutil = Filerepe2e_Util()
        self.config = GPDBConfig()
        self.gprecover = GpRecover(self.config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        self.gpfile = Gpfilespace(self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.port = os.getenv('PGPORT')
        self.base = GPDBStorageBaseTestCase()
        super(SuspendCheckpointCrashRecovery,self).__init__(methodName)

    def check_system(self):
        ''' 
        @summary: Check whether the system is up and in sync. Exit if not.
        '''
        cmd ="select count(*) from gp_segment_configuration where content<> -1 ;"
        count_all = PSQL.run_sql_command(cmd, flags ='-q -t', dbname='postgres')
        cmd ="select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';"
        count_up_and_sync = PSQL.run_sql_command(cmd, flags ='-q -t', dbname='postgres')
        if count_all.strip() != count_up_and_sync.strip() :
            os._exit(1)
        else:
            tinctest.logger.info("\n Starting New Test: System is up and in sync .........")

    def get_items_list(self, tests):
        ''' Get file contents to a list '''
        test_file = local_path(tests)
        with open(test_file, 'r') as f:
            test_list = [line.strip() for line in f]
        return test_list

    def checkPSQLRun(self, test):
        '''Check if the psql run started in background is over before running the _post.sql '''
        cmd_str = "ps -ef|grep '%s'|grep [p]sql" % test
        while(1):
            is_running = 0 
            cmd = Command('Check psql run', cmd_str)
            cmd.run()
            result = cmd.get_results()
            for line in result.stdout.splitlines():
                if '%s' %test in line:
                    tinctest.logger.info(line)
                    is_running = 1 
            if is_running == 0:
                return True
            else:
                sleep(5)
        return False

    def modify_sql_file(self, filename):
        ans_file = local_path(filename.replace('.sql' , '.ans'))
        for sfile in (filename, ans_file):
            for line in fileinput.FileInput(sfile,inplace=1):
                line = re.sub('gptest', os.getenv('PGDATABASE'), line)
                print str(re.sub('\n','',line))

    def validate_sql(self, filename):
        ''' Compare the out and ans files '''
        out_file = local_path(filename.replace(".sql", ".out"))
        ans_file = local_path(filename.replace('.sql' , '.ans'))
        assert Gpdiff.are_files_equal(out_file, ans_file)

    def run_sql(self, filename):
        ''' Run the provided sql and validate it '''
        out_file = local_path(filename.replace(".sql", ".out"))
        PSQL.run_sql_file(sql_file = filename, out_file = out_file)
        self.validate_sql(filename)

    def set_faults_before_executing_pre_sqls(self, cluster_state):
        ''' Set the checkpoint skip fault '''
        if cluster_state == 'change_tracking':
           self.cluster_in_change_tracking()
        self.fileutil.inject_fault(f='checkpoint', y='reset', r='primary', p=self.port)
        self.fileutil.inject_fault(f='checkpoint', y='skip', r='primary', p=self.port, o='0')
        tinctest.logger.info('Successfully injected fault to skip checkpointing') 
        if(cluster_state == 'resync'):
            self.fileutil.inject_fault(f='filerep_consumer', y='reset')
            self.fileutil.inject_fault(f='filerep_consumer', y='fault')
            self.fileutil.wait_till_change_tracking_transition()

    def suspend_fault(self, fault_name):
        ''' Suspend the provided fault_name '''
        self.fileutil.inject_fault(f='%s' % fault_name, y='reset', o='0', r='primary', p=self.port)
        self.fileutil.inject_fault(f='%s' % fault_name, y='suspend', o='0', r='primary', p=self.port)
        tinctest.logger.info('Successfully injected fault to suspend %s' % fault_name)

    def get_faults_before_executing_trigger_sqls(self, pass_num,cluster_state, test_type, ddl_type, aborting_create_needed=False):
        ''' Get the fault before trigger sqls are executed '''
        fault_name=''
        tinctest.logger.info('Fault Conditions: pass_num = [%s], cluster_state = [%s], test_type =  [%s], ddl_type = [%s], aborting_create_needed = [%s]' % (pass_num, cluster_state, test_type, ddl_type, aborting_create_needed)) 

        if pass_num == 1 and test_type == 'commit' and ddl_type == 'create':
            if aborting_create_needed:
                fault_name = 'finish_prepared_transaction_commit_pass1_aborting_create_needed'
            else:
                fault_name = 'finish_prepared_transaction_commit_pass1_from_create_pending_to_created'
                
        elif pass_num == 2 and test_type == 'commit' and ddl_type == 'create':
            if aborting_create_needed:
                fault_name = 'finish_prepared_transaction_commit_pass2_aborting_create_needed'
            else:
                fault_name = 'finish_prepared_transaction_commit_pass2_from_create_pending_to_created'

        elif pass_num == 1 and test_type == 'commit' and ddl_type == 'drop':
            fault_name = 'finish_prepared_transaction_commit_pass1_from_drop_in_memory_to_drop_pending'

        elif pass_num == 2 and test_type == 'commit' and ddl_type == 'drop':
            fault_name = 'finish_prepared_transaction_commit_pass2_from_drop_in_memory_to_drop_pending'

        elif pass_num == 1 and test_type == 'abort':
            if aborting_create_needed:
                fault_name = 'finish_prepared_transaction_abort_pass1_aborting_create_needed'
            else:
                fault_name = 'finish_prepared_transaction_abort_pass1_from_create_pending_to_aborting_create'

        elif pass_num == 2 and test_type == 'abort':
            if aborting_create_needed:
                fault_name = 'finish_prepared_transaction_abort_pass2_aborting_create_needed'
            else:
                fault_name = 'finish_prepared_transaction_abort_pass2_from_create_pending_to_aborting_create'

        elif pass_num == 0 and (test_type == 'abort' or test_type == 'commit'):
            pass  # For abort tests the error_txn_abort_after_dist_prepare_on_master fault is already set above; for commit tests, skipping the checkpoint is done by default for all tests.
        return fault_name

    def set_faults_before_executing_trigger_sqls(self, pass_num,cluster_state, test_type, ddl_type, aborting_create_needed=False):
        ''' Set the fault before trigger sqls are executed '''
        if (cluster_state == 'resync'):
            self.cluster_in_resync()
        fault_name=''
        fault_name = self.get_faults_before_executing_trigger_sqls(pass_num,cluster_state, test_type, ddl_type, aborting_create_needed=False);

        if (test_type == 'abort'):
            self.fileutil.inject_fault(f='transaction_abort_after_distributed_prepared', y='reset', p=self.port, o='0', seg_id=1)
            self.fileutil.inject_fault(f='transaction_abort_after_distributed_prepared', y='error', p=self.port, o='0', seg_id=1)
            tinctest.logger.info('Successfully injected fault to error out after distributed prepare for abort tests')

        if pass_num !=0 :
            self.suspend_fault(fault_name)
        elif pass_num == 0 : 
            fault_name = None
        if (cluster_state == 'resync'):
            self.fileutil.inject_fault(f='filerep_transition_to_sync_begin', y = 'reset', r = 'primary')
            self.fileutil.inject_fault(f='filerep_transition_to_sync_begin', y = 'suspend', r = 'primary')
            tinctest.logger.info('Successfully suspended filerep_transition_to_sync_begin')
            #Resume resync so that trigger sql can execute while resync is in progress
            self.fileutil.inject_fault(f='filerep_resync', y = 'resume', r = 'primary')
        return fault_name

    def cluster_in_resync(self):
        '''
        1. Suspend filerep_resync, 2. Suspend filerep_transition_to_sync_before_checkpoint, 3. Run gprecoverseg
        '''
        self.base.invoke_fault('filerep_resync', 'suspend', role='primary')
        self.base.invoke_fault('filerep_transition_to_sync_before_checkpoint', 'suspend', role='primary', port=self.port , occurence='0')
        rc = self.gprecover.incremental()
        if not rc:
            raise Exception('Gprecoverseg failed')
        tinctest.logger.info('Cluster in resync state')

    def switch_primary_mirror_role_in_utility_mode(self):
        '''Utility routine to start the master, connect in utility mode, switch the roles of primary and mirrors and shutdown the master '''
        cmd = Command('Start master in utility mode', 'export GPSTART_INTERNAL_MASTER_ONLY=1;gpstart -m')
        cmd.run(validateAfter=True)
        result = cmd.get_results()
        if result.rc != 0:
            raise Exception('Unable to start master in utility mode')
        tinctest.logger.info('Started master in utility mode')
    
        sql_cmd_list = ["update gp_segment_configuration set role='t' where role ='p' and content <> -1", "update gp_segment_configuration set role='p',mode='c' where role ='m' and content <> -1", "update gp_segment_configuration set role='m',status='d' where role ='t' and content <> -1"]
        for sql_cmd in sql_cmd_list:
            PSQL.run_sql_command(sql_cmd, PGOPTIONS="-c gp_session_role=utility -c allow_system_table_mods=dml")
        tinctest.logger.info('Updated the catalog to reverse the roles')
        rc = self.gpstop.run_gpstop_cmd(masteronly = True)
        if not rc:
            raise Exception('Failure to shut down the master')

    def stop_db(self):
        ''' gpstop immediate'''
        rc = self.gpstop.run_gpstop_cmd(immediate = True)
        if not rc:
            raise Exception('Failed to stop the cluster')
        tinctest.logger.info('Stopped cluster immediately')
    
    def start_db(self, down_segments=False):
        ''' Gpstart -a '''
        rc = self.gpstart.run_gpstart_cmd()
        if not rc:
            raise Exception('Failed to start the cluster')
        tinctest.logger.info('Started the cluster successfully')
       
        if not down_segments:
            if self.config.is_down_segments():
                raise Exception('Segments got marked down')

    # Sleep-free version of the crash-and-recovery driver, based on the fault triggered status
    def run_crash_and_recovery_fast(self,test_dir, pass_num, cluster_state, test_type, ddl_type, aborting_create_needed=False):
        if pass_num == 0:
            self.wait_till_all_sqls_done()
        else:
            mydir=local_path(test_dir)+'/trigger_sql/sql/'
            tinctest.logger.info('mydir = %s ' % mydir)
            trigger_count = len(glob.glob1(mydir,"*trigger.sql"))
            tinctest.logger.info('*** Count of trigger : %s *** ' % (trigger_count))
            if test_dir == "abort_create_tests":
                ''' The vacuum full sql does not hit the suspend fault. '''
                trigger_count = trigger_count - 1
            if test_dir == "abort_create_needed_tests":
                ''' Not all SQLs hit the fault for this case, hence wait for them to complete and then others to hit the fault'''
                self.wait_till_all_sqls_done(8 + 1)
                trigger_count = 8
            if test_dir == "abort_abort_create_needed_tests":
                ''' Not all SQLs hit the fault for this case, hence wait for them to complete and then others to hit the fault'''
                self.wait_till_all_sqls_done(6 + 1)
                trigger_count = 6
            fault_type = self.get_faults_before_executing_trigger_sqls(pass_num, cluster_state, test_type, ddl_type, aborting_create_needed=False)
            fault_hit = self.fileutil.check_fault_status(fault_name=fault_type, status="triggered", num_times_hit=trigger_count)
            if not fault_hit:
               raise Exception('Fault not hit expected number of times')

        self.stop_start_validate(cluster_state)

    def wait_till_all_sqls_done(self, count=1):
        ''' 500 here is just an arbitrarily long time "if-we-exceed-this-then-oh-crap-lets-error-out" value '''
        for i in range(1,500):
            psql_count = PSQL.run_sql_command("select count(*) from pg_stat_activity where current_query <> '<IDLE>'", flags='-q -t', dbname='postgres')
            if int(psql_count.strip()) <= count :
                return
            sleep(1)
        raise Exception('SQLs expected to complete but are still running')

    def stop_start_validate(self, cluster_state):
        ''' Do gpstop immediate, gpstart and see if all segments come back up fine '''
        if cluster_state == 'sync' :
            self.stop_db()
            self.switch_primary_mirror_role_in_utility_mode()
            tinctest.logger.info('Successfully switched roles of primary and mirrors in gp_segment_configuration')
            self.start_db(down_segments=True)
            rc = self.gprecover.incremental()
            if not rc:
                raise Exception('Gprecoverseg failed')
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')
        if cluster_state == 'change_tracking':
            self.stop_db()
            self.start_db(down_segments=True)

        if cluster_state == 'resync':
            #Resume the filerep_resync filerep_transition_to_sync_begin before stop-start
            self.fileutil.inject_fault(f='filerep_transition_to_sync_begin', y='resume', r='primary')
            self.stop_db()
            self.start_db()
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')
        self.dbstate.check_catalog(alldb=False)

    def cluster_in_change_tracking(self):
        '''
        Put Cluster into change_tracking
        '''
        self.base.invoke_fault('filerep_consumer', 'fault', role='primary')
        self.fileutil.wait_till_change_tracking_transition()
        tinctest.logger.info('Change_tracking transition complete')


    def validate_system(self, cluster_state):
        # Validate the system's integrity
        if (cluster_state == 'change_tracking'):
            if not self.gprecover.incremental():
                raise Exception('Gprecoverseg failed')
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')
            tinctest.logger.info('Segments recovered and back in sync')

        self.dbstate.check_mirrorintegrity()
        if self.config.has_master_mirror():
            self.dbstate.check_mirrorintegrity(master=True)

    def run_fault_injector_to_skip_checkpoint(self):
        tinctest.logger.info('Skip Checkpointing using fault injector.')
        self.fileutil.inject_fault(y = 'reset', f = 'checkpoint', r ='primary', H='ALL', m ='async', o = '0', p=self.port)
        (ok, out) = self.fileutil.inject_fault(y = 'skip', f = 'checkpoint', r ='primary', H='ALL', m ='async', o = '0', p=self.port)
        if not ok:
           raise Exception('Problem with injecting fault.')

    def backup_output_dir(self,test_dir, test_id):
        indir=local_path(test_dir)
        outdir = indir+'_'+test_id
        cmdstr="cp -r "+ indir + " " + outdir
        cmd = Command(name='run cp -r ', cmdStr=cmdstr)
        tinctest.logger.info("Taking a backup of SQL directory: %s" %cmd)
        try:
            cmd.run()
        except:
            self.fail("cp -r failed.")
        tinctest.logger.info("Test SQL directory Backup Done!!")

    def do_post_run_checks(self):
        self.stop_start_validate('sync')

        rc = self.gprecover.incremental()
        if not rc:
            raise Exception('Gprecoverseg failed')

        self.gprecover.wait_till_insync_transition()

        tinctest.logger.info("Done going from resync to insync")
        self.dbstate.check_catalog(alldb=False)
        self.dbstate.check_mirrorintegrity()

        if self.config.has_master_mirror():
            self.dbstate.check_mirrorintegrity(master=True)
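
A minimal sketch of one pass through the SuspendCheckpointCrashRecovery helpers above, assuming the class is instantiated as helper; the pre/trigger/post sql files the real tests execute are omitted and the driver is illustrative only:

def example_suspend_checkpoint_pass(helper, test_dir):
    pass_num, cluster_state, test_type, ddl_type = 1, 'sync', 'commit', 'create'
    helper.check_system()
    helper.set_faults_before_executing_pre_sqls(cluster_state)
    # ... run the pre sqls ...
    helper.set_faults_before_executing_trigger_sqls(pass_num, cluster_state, test_type, ddl_type)
    # ... run the trigger sqls in the background ...
    helper.run_crash_and_recovery_fast(test_dir, pass_num, cluster_state, test_type, ddl_type)
    # ... run the post sqls once the cluster is back ...
    helper.validate_system(cluster_state)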
Example #25
0
class FilerepTestCase(MPPTestCase):

    def __init__(self, methodName):    
        self.pgport = os.environ.get('PGPORT')
        self.util = Filerepe2e_Util()
        self.gpconfig = GpConfig()
        self.config = GPDBConfig()
        self.gpr = GpRecover(self.config)
        self.dbstate = DbStateClass('run_validation',self.config)
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        super(FilerepTestCase,self).__init__(methodName)

    def sleep(self, seconds=60):
        time.sleep(seconds)

    def create_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('create a file', 'touch %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def remove_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('remove a file', 'rm %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def get_timestamp_of_file_in_datadir(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('check timestamp', """ python -c "import os; print os.stat('%s').st_mtime" """ %
                      file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)
        res = cmd.get_results().stdout.strip()
        return res

    def verify_file_exists(self, content, role, filename):
        dbid = self.config.get_dbid(content=content, seg_role=role)
        host, datadir = self.config.get_host_and_datadir_of_segment(dbid=dbid)
        file_path = os.path.join(datadir, filename)
        cmd = Command('check if file exists', 'test -f %s' % file_path, ctxt=REMOTE, remoteHost=host)
        cmd.run(validateAfter=True)

    def handle_ext_cases(self,file):
        """
        @file: wet sql file in which the gpfdist location is rewritten for this machine's environment.
        """

        host = str(socket.gethostbyname(socket.gethostname())) #Must be an IP
        querystring = "gpfdist://"+host+":8088"
        
        if os.path.isfile(file):
            for line in fileinput.FileInput(file,inplace=1):
               line = re.sub('gpfdist.+8088',querystring,line)
               print str(re.sub('\n','',line))

    def handle_hybrid_part_cases(self, file):
        """
        @file: hybrid sql file in which the data file path is rewritten for this machine's environment
        """

        querystring = "FROM '"+local_path('hybrid_part.data')+"'" 
        if os.path.isfile(file):
            for line in fileinput.FileInput(file,inplace=1):
                line = re.sub('FROM\s\'.+hybrid_part.data\'',querystring,line)
                print str(re.sub('\n','',line))


    def preprocess(self):
        """ 
        Replace the hard-coded information in the sql files with the correct hostname, IP address, etc. 
        """

        list_workload_dir = ['set_sync1','sync1','set_ck_sync1','ck_sync1',
                        'set_ct','ct','set_resync','resync','set_sync2','sync2']
        for dir in list_workload_dir:
            sql_path = os.path.join(local_path(dir),'sql')
            ans_path = os.path.join(local_path(dir),'expected')
            for file in os.listdir(sql_path):
                    if (file.find('wet_ret')>=0):
                       self.handle_ext_cases(os.path.join(sql_path,file))
                    if (file.find('hybrid_part')>=0):
                       self.handle_hybrid_part_cases(os.path.join(sql_path,file))  
            for file in os.listdir(ans_path):
                    if (file.find('wet_ret')>=0):
                       self.handle_ext_cases(os.path.join(ans_path,file))
                    if (file.find('hybrid_part')>=0):
                       self.handle_hybrid_part_cases(os.path.join(ans_path,file)) 


    def clean_data(self):
        """ 
        Clean the data by removing the external table, otherwise, more data will be appended to the
        same external table from running multiple sql files. 
        """  

        test = local_path("")
        test = str(test) +"data/*.*"
    
        cmd = 'rm -rfv '+test
        run_shell_command(cmd)       

    def anydownsegments(self):
        """
        checks if any segments are down
        """        

        tinctest.logger.info("Checking if any segments are down")
        num_segments_down = self.count_of_nodes_down()
        if int(num_segments_down) == 0:
           return True
        else:
           return False

    def stop_start_validate(self, stopValidate=True):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine 
        """        

        tinctest.logger.info("Performing stop start validate")
        tinctest.logger.info("Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate = 'i', validate=stopValidate)
        if not ok and stopValidate:
           raise Exception('Problem while shutting down the cluster')
        tinctest.logger.info("Successfully shutdown the cluster.")

        tinctest.logger.info("Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failed to bring the cluster back up')
        tinctest.logger.info("Successfully restarted the cluster.")
        if not self.anydownsegments():
           raise Exception("segments were marked down")
        else:
           return (True, "All segments are up")


    def method_reset_fault_injection(self):
        """
        Resets fault injection
        Return: (True, [result]) if OK, or (False, [result]) otherwise
        """        

        tinctest.logger.info("Resetting fault injection")
        
        (ok1,out1) = self.util.inject_fault(f='filerep_resync', m = 'async', y = 'reset', r = 'primary', H ='ALL')
        if not ok1:
            raise Exception("Fault injection failed")   
        tinctest.logger.info("Done Injecting Fault  to reset resync")

        return (True, str(out1))


    def method_resume_filerep_resync(self):
        """
        Resumes the process of resync
        """

        tinctest.logger.info("Resuming Resync")
        (ok, out) = self.util.inject_fault(f='filerep_resync', m='async',y='resume', r='primary', H='ALL')
        if not ok:
            raise Exception("Fault injection failed")   
        tinctest.logger.info("Done resuming resync")
        return (ok, out)

    def run_method_suspendresync(self):
        """
        Stops the cluster from going to resync
        """

        tinctest.logger.info("Suspending resync")
        (ok,out) = self.util.inject_fault(f='filerep_resync', m='async' , y='suspend', r ='primary', H='ALL')
        tinctest.logger.info('output from suspend resync %s'%out)
        if not ok:
            raise Exception("Fault injection failed")   
        tinctest.logger.info("Done Injecting Fault to suspend resync")
        return (ok, out)
      

    def count_of_masters(self):
        """
        Gives count of number of nodes in the cluster that are master 
        Return: count of number of nodes in the cluster that are master
        """

        tinctest.logger.info("Count the number of masters")
        cmd = "select count(*) from gp_segment_configuration where content = -1"
        (out) = PSQL.run_sql_command(cmd)
        num_master = out.split('\n')[3].strip()
        return num_master 


    def count_of_nodes(self):
        """
        Gives count of number of nodes in the cluster
        Return: count of number of nodes in the cluster
        """

        tinctest.logger.info("Counting number of nodes")
        cmd = "select count(*) from gp_segment_configuration"
        (num_cl) = PSQL.run_sql_command(cmd)
        total_num_rows = num_cl.split('\n')[3].strip()
        return total_num_rows


    def count_of_nodes_in_ct(self):
        """
        Gives count of number of nodes in change tracking
        Return: count of number of nodes in change tracking
        """

        tinctest.logger.info("Counting number of nodes in ct")
        sqlcmd = "select count(*) from gp_segment_configuration where mode = 'c'"
        (num_cl) = PSQL.run_sql_command(sqlcmd)
        num_cl = num_cl.split('\n')[3].strip()
        return num_cl


    def count_of_nodes_down(self):
        """
        Gives count of number of nodes marked as down
        Return: count of number of nodes marked as down
        """

        tinctest.logger.info("Counting the number of nodes down")
        sqlcmd = "select count(*) from gp_segment_configuration where status = 'd'"
        (num_down) = PSQL.run_sql_command(sqlcmd)
        num_down = num_down.split('\n')[3].strip()
        return num_down    


    def count_of_nodes_sync(self):
        """
        Gives count of number of nodes in sync
        Return: count of number of nodes in sync
        """

        tinctest.logger.info("Counting the number of nodes in sync")        
        sqlcmd = "select count(*) from gp_segment_configuration where mode = 's'"
        (num_sync) = PSQL.run_sql_command(sqlcmd)
        num_sync = num_sync.split('\n')[3].strip()
        return num_sync


    def count_of_nodes_not_sync(self):
        """
        Gives count of number of nodes not in sync
        Return: count of number of nodes not in sync
        """

        tinctest.logger.info("Counting number of nodes not in sync")
        sqlcmd = "select count(*) from gp_segment_configuration where mode <> 's'"
        (num_sync) = PSQL.run_sql_command(sqlcmd)
        num_sync = num_sync.split('\n')[3].strip()
        return num_sync

    def inject_fault_on_first_primary(self):
        """
	@product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        tinctest.logger.info("\n Injecting faults on first primary")
        (ok,out) = self.util.inject_fault(f='filerep_immediate_shutdown_request', m='async' , y='infinite_loop', r ='primary', seg_id=2, sleeptime=300)
        if not ok:
            raise Exception("Fault filerep_immediate_shutdown_request injection failed")   

        (ok,out) = self.util.inject_fault(f='fileRep_is_operation_completed', m='async' , y='infinite_loop', r ='primary', seg_id=2)
        if not ok:
            raise Exception("Fault fileRep_is_operation_completed injection failed")   
        tinctest.logger.info("\n Done Injecting Fault")


    def inject_fault_on_first_mirror(self):
        """
	@product_version gpdb:[4.3.3.0-], gpdb:[4.2.8.1-4.2]
        """
        sqlcmd = "select dbid from gp_segment_configuration where content=0 and role='m'"
        (first_mirror_dbid) = PSQL.run_sql_command(sqlcmd)
        first_mirror_dbid = first_mirror_dbid.split('\n')[3].strip()

        tinctest.logger.info("\n Injecting faults on first mirror")
        flag = self.util.check_fault_status(fault_name='fileRep_is_operation_completed', status='triggered', max_cycle=100);
        if not flag:
            raise Exception("Fault fileRep_is_operation_completed didn't trigger")   
 
        (ok,out) = self.util.inject_fault(f='filerep_consumer', m='async' , y='panic', r ='mirror', seg_id=first_mirror_dbid)
        if not ok:
            raise Exception("Fault filerep_consumer injection failed")   
        tinctest.logger.info("\n Done Injecting Fault")

    def setupGpfdist(self, port, path):
        gpfdist = Gpfdist(port , self.hostIP())
        gpfdist.killGpfdist()
        gpfdist.startGpfdist(' -t 30 -m 1048576 -d '+path)
        return True

    def cleanupGpfdist(self, port,path):
        gpfdist = Gpfdist(port , self.hostIP())
        gpfdist.killGpfdist()
        return True

    def hostIP(self):
        ok = run_shell_command('which gpfdist')
        if not ok:
            raise GPtestError("Error:'which gpfdist' command failed.")
        hostname = socket.gethostname()
        if hostname.find('mdw') > 0 :
            host = 'mdw'
        else:
            host = str(socket.gethostbyname(socket.gethostname())) #Must be an IP
        tinctest.logger.info('current host is %s'%host)
        return host

    def method_setup(self):
        tinctest.logger.info("Performing setup tasks")
        gpfs=Gpfilespace()
        gpfs.create_filespace('filerep_fs_a')
        gpfs.create_filespace('filerep_fs_b')
        gpfs.create_filespace('filerep_fs_c')
        gpfs.create_filespace('filerep_fs_z')
        gpfs.create_filespace('sync1_fs_1') 
 
        # Set max_resource_queues to 100 
        cmd = 'gpconfig -c max_resource_queues -v 100 '
        ok = run_shell_command(cmd)
        if not ok:
            raise Exception('Failure during setting the max_resource_queues value to 100 using gpconfig tool')
        #Restart the cluster
        self.gpstop.run_gpstop_cmd(immediate = 'i')
        ok = self.gpstart.run_gpstart_cmd()
        if not ok:
            raise Exception('Failure during restarting the cluster')
        return True


    def get_ext_table_query_from_gpstate(self):
        outfile = local_path("gpstate_tmp")
        ok = run_shell_command("gpstate --printSampleExternalTableSql >"+ outfile)
        querystring = ""
        flag = 'false'
        out = open(outfile, 'r').readlines()
        for line in out:
            line.strip()
            if (line.find('DROP EXTERNAL TABLE IF EXISTS gpstate_segment_status')>=0):
                flag = 'true'
            if flag == 'true':
                querystring = querystring + line
        return querystring

    def check_gpstate(self, type, phase):
        """ 
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """       

        if phase == 'sync1':
            state_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Synchronized' and status_in_config='Up' and instance_status='Up'")
            sync1_num = self.query_select_count("select count(*) from gp_segment_configuration where content <> -1")
            if int(sync1_num) <> int(state_num):
                raise Exception("gpstate in Sync state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " %(phase))

        elif phase == 'ct':
            p_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Change Tracking'  and role = 'Primary' and status_in_config='Up' and instance_status='Up'")
            m_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Out of Sync'  and role = 'Mirror' and status_in_config='Down' and instance_status='Down in configuration' ")

            if int(p_num) <> int(m_num):
                raise Exception("gpstate in CT state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " %(phase))

        elif phase == 'resync_incr':
            
            if type == 'primary':
                query = "select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Incremental'"
                resync_incr_num = self.query_select_count(query)
            else:
                query = "select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing' and  status_in_config='Up' and instance_status='Up' and resync_mode= 'Incremental'"
                resync_incr_num = self.query_select_count(query)
            
            query_num_rows = "select count(*) from gp_segment_configuration where content <> -1"
            num_rows = self.query_select_count(query_num_rows)
            
            if int(resync_incr_num) <> int(num_rows):
                tinctest.logger.info("resync_incr_num query run %s" % query)
                tinctest.logger.info("num_rows query run %s" % query_num_rows)
                raise Exception("gpstate in Resync Incremental  state failed. resync_incr_num %s <> num_rows %s" % (resync_incr_num, num_rows))
            tinctest.logger.info("Done Running gpstate in %s phase " %(phase))

        elif phase == 'resync_full':
            num_rows = self.query_select_count("select count(*) from gp_segment_configuration where content <> -1")
          
            if type == 'primary':
                resync_full_num = self.query_select_count("select count(*) from gpstate_segment_status where role = preferred_role and mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'")
            else:
                resync_full_num = self.query_select_count("select count(*) from gpstate_segment_status where mirror_status ='Resynchronizing'  and  status_in_config='Up' and instance_status='Up'  and resync_mode= 'Full'")

            if int(resync_full_num) <> int(num_rows):
                raise Exception("gptate in Resync Full state failed")
            tinctest.logger.info("Done Running gpstate in %s phase " %(phase))
        
        return True
    
    def trigger_transition(self):
        PSQL.run_sql_file(local_path('mirrors.sql'))
        

    def run_gpstate(self, type, phase):            
        """
        Perform gpstate for each different transition state
        @type: failover type
        @phase: transition stage, can be sync1, ck_sync1, ct, resync, sync2
        """

        tinctest.logger.info("running gpstate")
        querystring = self.get_ext_table_query_from_gpstate()
        file1 = local_path('create_table_gpstate.sql')
        f1 = open(file1,'w')
        f1.write(querystring)
        f1.write('\n')
        f1.close()
        PSQL.run_sql_file(local_path('create_table_gpstate.sql'))

        gpstate_outfile = local_path('gpstate_out')
        cmd = 'gpstate -s -a > %s 2>&1' % (gpstate_outfile)

        ok  = run_shell_command(cmd)
        self.check_gpstate(type, phase)
        return ok


    def check_mirror_seg(self, master=False):
        tinctest.logger.info("running check mirror")
        self.dbstate.check_mirrorintegrity()

    def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
        tinctest.logger.info("running gpcheckcat")
        self.dbstate.check_catalog(outputFile=outputFile)

    def query_select_count(self,sqlcmd):
        (num) = PSQL.run_sql_command(sqlcmd)
        num = num.split('\n')[3].strip()
        return num
    
    def method_run_failover(self,type):
        """
        Inject fault to failover nodes
        @type: primary [induces fault in mirror] mirror [creates panic in primary]   
        Return: (True, [result of fault injection]) if OK, or (False, [result of fault injection]) otherwise
        """

        if type == 'primary':
            tinctest.logger.info("\n primary failover")
            (ok,out) = self.util.inject_fault(f='filerep_consumer', m='async' , y='fault', r ='mirror', H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")

        elif type == 'mirror':
            tinctest.logger.info("\n Mirror failover")
            (ok,out) = self.util.inject_fault(f='postmaster', m='async' , y='panic', r ='primary', H='ALL')
            tinctest.logger.info("\n Done Injecting Fault")
        return True

    def wait_till_change_tracking_transition(self):
        self.util.wait_till_change_tracking_transition()

    def wait_till_insync_transition(self):
        self.gpr.wait_till_insync_transition()
   
    def run_gprecoverseg(self,recover_mode):
        if recover_mode == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

    def run_gpconfig(self, parameter, master_value, segment_value):
        if (parameter is not None):
            self.gpconfig.setParameter(parameter, master_value, segment_value)
            self.gpstop.run_gpstop_cmd(restart='r')

    def inject_fault(self, fault = None, mode = None, operation = None, prim_mirr = None, host = 'All', table = None, database = None, seg_id = None, sleeptime = None, occurence = None):
        if (fault == None or mode == None or operation == None or prim_mirr == None):
            raise Exception('Incorrect parameters provided for inject fault')

        (ok,out) = self.util.inject_fault(f=fault, m=mode , y=operation, r=prim_mirr, H='ALL', table=table, database=database, sleeptime=sleeptime, o=occurence, seg_id=seg_id)
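
A minimal, hypothetical sketch (not part of the original example) of how the helper methods above might be chained in a primary-failover check; the instance name `helper` and the phase names ('ct', 'sync2', taken from the run_gpstate docstring) are assumptions.

def verify_primary_failover(helper):
    # push the mirrors down so the primaries move into change tracking
    helper.method_run_failover('primary')
    helper.wait_till_change_tracking_transition()
    helper.run_gpstate('primary', 'ct')

    # recover incrementally and confirm the cluster returns to sync
    helper.run_gprecoverseg('incr')
    helper.wait_till_insync_transition()
    helper.run_gpstate('primary', 'sync2')
    helper.check_mirror_seg()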
Example #26
0
class PgtwoPhaseClass(MPPTestCase):
    '''Helper class for pg_twophase supporting functions '''
    def __init__(self, methodName):
        self.filereputil = Filerepe2e_Util()
        self.config = GPDBConfig()
        self.gprecover = GpRecover(self.config)
        self.gpstop = GpStop()
        self.gpstart = GpStart()
        self.gpfile = Gpfilespace(self.config)
        self.gpverify = GpdbVerify(config=self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.port = os.getenv('PGPORT')
        super(PgtwoPhaseClass, self).__init__(methodName)

    def invoke_fault(self,
                     fault_name,
                     type,
                     role='mirror',
                     port=None,
                     occurence=None,
                     sleeptime=None,
                     seg_id=None):
        ''' Reset the fault and then issue the fault with the given type'''
        self.filereputil.inject_fault(f=fault_name,
                                      y='reset',
                                      r=role,
                                      p=port,
                                      o=occurence,
                                      sleeptime=sleeptime,
                                      seg_id=seg_id)
        self.filereputil.inject_fault(f=fault_name,
                                      y=type,
                                      r=role,
                                      p=port,
                                      o=occurence,
                                      sleeptime=sleeptime,
                                      seg_id=seg_id)
        tinctest.logger.info(
            'Successfully injected fault_name : %s fault_type : %s' %
            (fault_name, type))

    def inject_fault(self, fault_type):
        '''
        @param fault_type : type of fault to be suspended
        '''
        if fault_type == 'end_prepare_two_phase_sleep':
            self.filereputil.inject_fault(f='end_prepare_two_phase_sleep',
                                          sleeptime='1000',
                                          y='sleep',
                                          r='primary',
                                          p=self.port)
            tinctest.logger.info(
                'Injected fault to sleep in end_prepare_two_phase')

        elif fault_type == 'abort':
            # For the abort fault we also inject this error fault to fake a situation where one of the
            # segments does not respond, which causes the master to trigger an abort of the transaction
            self.invoke_fault('transaction_abort_after_distributed_prepared',
                              'error',
                              port=self.port,
                              occurence='0',
                              seg_id='1')

            self.invoke_fault('twophase_transaction_abort_prepared',
                              'suspend',
                              role='primary',
                              port=self.port,
                              occurence='0')

        elif fault_type == 'commit':
            self.invoke_fault('twophase_transaction_commit_prepared',
                              'suspend',
                              role='primary',
                              port=self.port,
                              occurence='0')

        elif fault_type == 'dtm_broadcast_prepare':
            self.invoke_fault('dtm_broadcast_prepare',
                              'suspend',
                              seg_id='1',
                              port=self.port,
                              occurence='0')

        elif fault_type == 'dtm_broadcast_commit_prepared':
            self.invoke_fault('dtm_broadcast_commit_prepared',
                              'suspend',
                              seg_id='1',
                              port=self.port,
                              occurence='0')

        elif fault_type == 'dtm_xlog_distributed_commit':
            self.invoke_fault('dtm_xlog_distributed_commit',
                              'suspend',
                              seg_id='1',
                              port=self.port,
                              occurence='0')

    def resume_faults(self, fault_type, cluster_state='sync'):
        '''
        @param fault_type : commit/abort/end_prepare_two_phase_sleep/dtm_broadcast_prepare/dtm_broadcast_commit_prepared/dtm_xlog_distributed_commit
        @description : Resume the suspended faults 
        '''
        tinctest.logger.info('coming to resume faults with xact %s' %
                             fault_type)
        if fault_type == 'abort':
            self.filereputil.inject_fault(
                f='twophase_transaction_abort_prepared',
                y='resume',
                r='primary',
                p=self.port,
                o='0')
            if cluster_state != 'resync':
                self.filereputil.inject_fault(
                    f='transaction_abort_after_distributed_prepared',
                    y='reset',
                    p=self.port,
                    o='0',
                    seg_id='1')
        elif fault_type == 'commit':
            self.filereputil.inject_fault(
                f='twophase_transaction_commit_prepared',
                y='resume',
                r='primary',
                p=self.port,
                o='0')

        elif fault_type == 'dtm_broadcast_prepare':
            self.filereputil.inject_fault(f='dtm_broadcast_prepare',
                                          y='resume',
                                          seg_id='1',
                                          p=self.port,
                                          o='0')

        elif fault_type == 'dtm_broadcast_commit_prepared':
            tinctest.logger.info('coming to if dtm_broadcast_commit_prepared')
            self.filereputil.inject_fault(f='dtm_broadcast_commit_prepared',
                                          y='resume',
                                          seg_id='1',
                                          p=self.port,
                                          o='0')

        elif fault_type == 'dtm_xlog_distributed_commit':
            self.filereputil.inject_fault(f='dtm_xlog_distributed_commit',
                                          y='resume',
                                          seg_id='1',
                                          p=self.port,
                                          o='0')

        else:
            tinctest.logger.info('No faults to resume')
        tinctest.logger.info('Resumed the suspended transaction fault')

        #Wait till all the trigger_sqls are complete before returning
        sql_count = PSQL.run_sql_command(
            'select count(*) from pg_stat_activity;',
            flags='-q -t',
            dbname='postgres')
        while (sql_count.strip() != '1'):
            sleep(5)
            sql_count = PSQL.run_sql_command(
                'select count(*) from pg_stat_activity;',
                flags='-q -t',
                dbname='postgres')
            tinctest.logger.info('stat_activity count %s ' % sql_count)
        return

    def start_db(self):
        '''Gpstart '''
        rc = self.gpstart.run_gpstart_cmd()
        if not rc:
            raise Exception('Failed to start the cluster')
        tinctest.logger.info('Started the cluster successfully')

    def stop_db(self):
        ''' Gpstop and don't check the return code '''
        cmd = Command('Gpstop_a', 'gpstop -a')
        tinctest.logger.info('Executing command: gpstop -a')
        cmd.run()

    def crash_and_recover(self,
                          crash_type,
                          fault_type,
                          checkpoint='noskip',
                          cluster_state='sync'):
        '''
        @param crash_type : gpstop_i/gpstop_a/failover_to_primary/failover_to_mirror
        @note: when skip checkpoint is enabled, gpstop -a returns a non-zero return code and the library treats it as a failure. To work around this, a local stop function is used instead
        '''
        if crash_type == 'gpstop_i':
            rc = self.gpstop.run_gpstop_cmd(immediate=True)
            if not rc:
                raise Exception('Failed to stop the cluster')
            tinctest.logger.info('Stopped cluster immediately')
            self.start_db()
        elif crash_type == 'gpstop_a':
            self.resume_faults(fault_type, cluster_state)
            if checkpoint == 'skip':
                self.stop_db()
            else:
                rc = self.gpstop.run_gpstop_cmd()
                if not rc:
                    raise Exception('Failed to stop the cluster')
            tinctest.logger.info('Smart stop completed')
            self.start_db()
        elif crash_type == 'failover_to_primary':
            self.invoke_fault('filerep_consumer', 'fault')
            self.resume_faults(fault_type, cluster_state)
            (rc, num) = self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Value of rc and num_down %s, %s, %s' %
                                 (rc, num, fault_type))

        elif crash_type == 'failover_to_mirror':
            self.invoke_fault('postmaster', 'panic', role='primary')
            if fault_type in ('dtm_broadcast_prepare',
                              'dtm_broadcast_commit_prepared',
                              'dtm_xlog_distributed_commit'):
                self.resume_faults(fault_type, cluster_state)
            (rc, num) = self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Value of rc and num_down %s, %s' % (rc, num))
            if fault_type == 'abort':
                self.filereputil.inject_fault(
                    f='transaction_abort_after_distributed_prepared',
                    y='reset',
                    p=self.port,
                    o='0',
                    seg_id='1')

        if cluster_state == 'resync':
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')

    def get_trigger_status_old(self, trigger_count):
        '''Compare the pg_stat_activity count with the total number of trigger_sqls executed '''
        for i in range(1, 50):
            psql_count = PSQL.run_sql_command(
                'select count(*) from pg_stat_activity;',
                flags='-q -t',
                dbname='postgres')
        tinctest.logger.info('Count of trigger sqls %s' % psql_count)
        if int(psql_count.strip()) < trigger_count:
            tinctest.logger.info('coming to the if loop in get_trigger_status')
            return False
        return True

    def get_trigger_status(self, trigger_count, fault_type):
        if fault_type == None:
            return self.get_trigger_status_old(trigger_count)

        return self.filereputil.check_fault_status(fault_name=fault_type,
                                                   status="triggered",
                                                   seg_id='1',
                                                   num_times_hit=trigger_count)

    def check_trigger_sql_hang(self, test_dir, fault_type=None):
        '''
        @description : Return the status of the trigger sqls: whether they are waiting on the fault 
        Since gpfaultinjector has no way to check if all the sqls are triggered, we are using 
        a count(*) on pg_stat_activity and compare the total number of trigger_sqls
        '''
        trigger_count = 0
        for dir in test_dir.split(","):
            trigger_dir = local_path('%s/trigger_sql/sql/' % (dir))
            trigger_count += len(glob.glob1(trigger_dir, "*.sql"))
        tinctest.logger.info('Total number of sqls to trigger %d in %s' %
                             (trigger_count, test_dir))
        return self.get_trigger_status(trigger_count, fault_type)

    def run_faults_before_pre(self, cluster_state):
        '''
        @param cluster_state : sync/change_tracking/resync
        @description: 1. Put the cluster into change_tracking for the resync case.
        '''
        if cluster_state == 'resync':
            self.invoke_fault('filerep_consumer', 'fault')
            self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Change_tracking transition complete')

    def run_faults_before_trigger(self, checkpoint, cluster_state, fault_type):
        '''
        @param checkpoint : skip/noskip
        @param cluster_state : sync/change_tracking/resync
        @param fault_type : commit/abort
        @param end_prepare_two_phase_sleep : True/False
        @description : 1. Suspend resync faults. 2. Issue Checkpoint before the skip checkpoint, so that the bufferpool is cleared. 3. If skip issue 'skip checkpoint'. 4. Suspend transaction_faults based on test_type.
        '''
        if cluster_state == 'change_tracking':
            self.invoke_fault('filerep_consumer', 'fault')
            self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Change_tracking transition complete')

        if cluster_state == 'resync':
            self.invoke_fault('filerep_resync', 'suspend', role='primary')

            if checkpoint == 'skip':
                self.invoke_fault(
                    'filerep_transition_to_sync_before_checkpoint',
                    'suspend',
                    role='primary',
                    port=self.port,
                    occurence='0')
            rc = self.gprecover.incremental()
            if not rc:
                raise Exception('Gprecoverseg failed')
            tinctest.logger.info('Cluster in resync state')

        PSQL.run_sql_command('CHECKPOINT;', dbname='postgres')
        if checkpoint == 'skip':
            self.invoke_fault('checkpoint',
                              'skip',
                              role='primary',
                              port=self.port,
                              occurence='0')
        self.inject_fault(fault_type)

        if cluster_state == 'resync':
            self.filereputil.inject_fault(f='filerep_resync',
                                          y='resume',
                                          r='primary')

        PSQL.wait_for_database_up()

    def run_crash_and_recover(self,
                              crash_type,
                              fault_type,
                              test_dir,
                              cluster_state='sync',
                              checkpoint='noskip'):
        '''
        @param crash_type : gpstop_i/gpstop_a/failover_to_mirror/failover_to_primary
        @param fault_type : commit/abort/end_prepare_two_phase_sleep
        @param test_dir : dir of the trigger sqls
        @description : Execute the specified crash type before/after resuming the suspended fault and recover
        '''
        trigger_status = self.check_trigger_sql_hang(test_dir)
        tinctest.logger.info('trigger_status %s' % trigger_status)
        sleep(
            50
        )  # This sleep is needed till we get a way to find the state of all suspended sqls
        if trigger_status == True:
            if cluster_state == 'resync':
                self.filereputil.inject_fault(
                    f='filerep_transition_to_sync_before_checkpoint',
                    y='resume',
                    r='primary')
                sleep(15)  # wait little before crash
            self.crash_and_recover(crash_type, fault_type, checkpoint,
                                   cluster_state)
        else:
            tinctest.logger.info('The fault_status is not triggered')

    def gprecover_rebalance(self):
        '''
        @description: Rebalancing through gpstop -air is much faster than gprecoverseg -r for test purposes.
        '''
        rc = self.gpstop.run_gpstop_cmd(immediate=True)
        if not rc:
            raise Exception('Failed to stop the cluster')
        tinctest.logger.info('Stopped cluster immediately')
        self.start_db()

    def run_gprecover(self, crash_type, cluster_state='sync'):
        '''Recover the cluster if required. '''
        if crash_type in ('failover_to_primary', 'failover_to_mirror'
                          ) or cluster_state == 'change_tracking':
            rc = self.gprecover.incremental()
            if not rc:
                raise Exception('Gprecoverseg failed')
            if not self.gprecover.wait_till_insync_transition():
                raise Exception('Segments not in sync')
            tinctest.logger.info('Cluster in sync state')
            if crash_type == 'failover_to_mirror':
                self.gprecover_rebalance()
                tinctest.logger.info('Successfully Rebalanced the cluster')
        else:
            tinctest.logger.info(
                'No need to run gprecoverseg. The cluster should be already in sync'
            )

    def switch_ckpt_faults_before_trigger(self, cluster_state, fault_type):
        '''
        @param cluster_state : sync/change_tracking/resync
        @param fault_type : dtm_broadcast_prepare/dtm_broadcast_commit_prepared/dtm_xlog_distributed_commit
        '''
        if cluster_state in ('change_tracking', 'resync'):
            self.invoke_fault('filerep_consumer', 'fault')
            self.filereputil.wait_till_change_tracking_transition()
            tinctest.logger.info('Change_tracking transition complete')

        if cluster_state == 'resync':
            self.invoke_fault('filerep_resync', 'suspend', role='primary')
            rc = self.gprecover.incremental()
            if not rc:
                raise Exception('Gprecoverseg failed')
            tinctest.logger.info('Cluster in resync state')
        self.inject_fault(fault_type)

    def switch_ckpt_switch_xlog(self):
        '''
        @description: pg_switch_xlog on segments
        '''
        sql_cmd = 'select * from pg_switch_xlog();'
        num_primary = self.config.get_countprimarysegments()
        for i in range(num_primary):
            (host,
             port) = self.config.get_hostandport_of_segment(psegmentNumber=i)
            PSQL.run_sql_command_utility_mode(sql_cmd, host=host, port=port)

    def switch_checkpoint_loop(self, fault_type):
        '''     
        @description: Run pg_switch_xlog once or repeatedly depending on the fault_type
        '''
        if fault_type == 'dtm_xlog_distributed_commit':
            self.switch_ckpt_switch_xlog()
        else:
            for i in range(5):
                self.switch_ckpt_switch_xlog()

    def switch_ckpt_crash_and_recover(self,
                                      crash_type,
                                      fault_type,
                                      test_dir,
                                      cluster_state='sync',
                                      checkpoint='noskip'):
        '''
        @param crash_type : gpstop_i/gpstop_a/failover_to_mirror/failover_to_primary
        @param fault_type : dtm_broadcast_prepare/dtm_broadcast_commit_prepared/dtm_xlog_distributed_commit
        @param test_dir : dir of the trigger_sqls
        '''
        trigger_status = self.check_trigger_sql_hang(test_dir, fault_type)
        tinctest.logger.info('trigger_status %s' % trigger_status)

        if trigger_status == True:
            if cluster_state == 'resync':
                self.filereputil.inject_fault(f='filerep_resync',
                                              y='resume',
                                              r='primary')
                sleep(30)  #Give a little time before crash.
            self.crash_and_recover(crash_type, fault_type, checkpoint,
                                   cluster_state)
        else:
            tinctest.logger.info('The fault_status is not triggered')

    def cleanup_dangling_processes(self):
        '''
        @description: Since the tests suspend transactions at different stages and then do an immediate shutdown,
        a few processes are not cleaned up and would eventually eat up system resources.
        This method kills such dangling processes, if any exist, at the end of each test.
        '''

        num_primary = self.config.get_countprimarysegments()
        for i in range(num_primary):
            (host,
             port) = self.config.get_hostandport_of_segment(psegmentNumber=i)
            grep_cmd = "ps -ef|grep %s|grep 'Distributed'" % port
            cmd = Command('Check for dangling process',
                          cmdStr='gpssh -h %s -e "%s" ' % (host, grep_cmd))
            cmd.run()
            result = cmd.get_results()
            if len(result.stdout.splitlines()) > 2:
                grep_and_kill_cmd = "ps -ef|grep %s|grep 'Distributed'|awk '{print \$2}'|xargs kill -9" % port
                cmd = Command('Kill dangling processes',
                              cmdStr='gpssh -h %s -e "%s" ' %
                              (host, grep_and_kill_cmd))
                cmd.run()
                tinctest.logger.info('Killing the dangling processes')
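
As a rough illustration (not part of the original suite), the sketch below chains the PgtwoPhaseClass helpers for a commit-fault crash scenario; `helper` is assumed to be an already constructed PgtwoPhaseClass instance and 'commit_tests' is an assumed trigger-sql directory layout.

def crash_during_commit_prepared(helper, test_dir='commit_tests'):
    helper.run_faults_before_pre(cluster_state='sync')
    helper.run_faults_before_trigger(checkpoint='noskip',
                                     cluster_state='sync',
                                     fault_type='commit')
    # the trigger sqls under <test_dir>/trigger_sql/sql/ are assumed to be
    # running concurrently here, suspended on the injected fault
    helper.run_crash_and_recover(crash_type='gpstop_i',
                                 fault_type='commit',
                                 test_dir=test_dir,
                                 cluster_state='sync',
                                 checkpoint='noskip')
    helper.run_gprecover(crash_type='gpstop_i', cluster_state='sync')
    helper.cleanup_dangling_processes()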
Example #27
0
class SubTransactionLimitRemovalTestCase(MPPTestCase):

    def __init__(self, methodName):    
        super(SubTransactionLimitRemovalTestCase,self).__init__(methodName)
   
    def check_system(self):
        '''
        @summary: Check whether the system is up and in sync. Exit if not
        '''
        tinctest.logger.info("[STLRTest] Running check_system")   
        
        tinctest.logger.info("[STLRTest] Check whether the system is up and sync")   
        
        cmd ="select count(*) from gp_segment_configuration where content<> -1 ;"
        (num_cl) = PSQL.run_sql_command(cmd)
        count_all = num_cl.split('\n')[3].strip()
               
        cmd ="select count(*) from gp_segment_configuration where content<> -1 and mode = 's' and status = 'u';"
        (num_cl) = PSQL.run_sql_command(cmd)
        count_up_and_sync = num_cl.split('\n')[3].strip()
        tinctest.logger.info("[STLRTest] printing gp segment configuration")
        (gp_seg_conf) = PSQL.run_sql_command("select * from gp_segment_configuration order by dbid")
        tinctest.logger.info(gp_seg_conf)


        if count_all != count_up_and_sync :
            raise Exception("[STLRTest] System not in sync and up. Exiting test")
        else:
            tinctest.logger.info("[STLRTest] Starting New Test: System is up and in sync...")

    def run_sqls(self,test):
        '''
        @summary : Run the sql 
        @param test: the sql file to run
        '''        
        tinctest.logger.info("[STLRTest] Running run_sqls")   
        tinctest.logger.info("[STLRTest]Starting new thread to run sql %s"%(test))
        PSQL.run_sql_file(local_path(test))
            
    def suspend_faults(self,fault_name):
        '''
        @summary : Suspend the specified fault: reset it before issuing suspend 
        @param fault_name : Name of the fault to suspend
        '''
        tinctest.logger.info("[STLRTest] Running suspend_faults")   

        self.util = Filerepe2e_Util()

        (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'reset', r = 'primary', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")   
        tinctest.logger.info("[STLRTest]Done resetting the %s fault"%(fault_name))      

        (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'suspend', r = 'primary', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")   
        tinctest.logger.info("[STLRTest]Done suspending the %s fault"%(fault_name))
        
    def check_fault_status(self,fault_name = None, status = None, max_cycle=10):
        ''' 
        Check whether a fault is triggered. Poll till the fault is triggered
        @param fault_name : Fault name
        @param status : Status to be checked - triggered/completed
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running check_fault_status %s", status)   

        if (not fault_name) or (not status) :
            raise Exception("[STLRTest]Need a value for fault_name and status to continue")

        poll =0
        while(poll < max_cycle):
            (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'status', r = 'primary', H ='ALL')
            poll +=1
            for line in out1.splitlines():
                if line.find(fault_name) > 0 and line.find(status) > 0 :
                    tinctest.logger.info("[STLRTest]Fault %s is %s " % (fault_name,status))
                    poll = 0
                    tinctest.logger.info("[STLRTest] Running check_fault_status %s TRUE", status)
                    return True

            #sleep a while before start polling again
            sleep(10)
        tinctest.logger.info("[STLRTest] Running check_fault_status %s FALSE", status)
        return False
        
        
    def filerep_fault(self,trans_state):
        '''
        @summary : Inject the filerep fault supplied
        @param trans_state : type of transition 
        '''
        tinctest.logger.info("[STLRTest] Running filerep_fault")   
        self.util = Filerepe2e_Util()

        if trans_state == 'failover_to_primary':
            tinctest.logger.info("[STLRTest] primary failover")
            (ok1,out1) = self.util.inject_fault(f='filerep_consumer', m = 'async', y = 'fault', r = 'mirror', H ='ALL')

            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")   
            tinctest.logger.info("[STLRTest]Done primary failover fault")

        elif trans_state == 'failover_to_mirror':
            tinctest.logger.info("[STLRTest] fault for postmaster panic")
            (ok1,out1) = self.util.inject_fault(f='postmaster', m = 'async', y = 'panic', r = 'primary', H ='ALL')

            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")   
            tinctest.logger.info("[STLRTest]Done postmaster panic fault")

        elif trans_state == 'postmaster_reset':
            tinctest.logger.info("[STLRTest] fault for filerep_sender panic")
            (ok1,out1) = self.util.inject_fault(f='filerep_sender', m = 'async', y = 'panic', r = 'primary', H ='ALL')

            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")   
            tinctest.logger.info("[STLRTest]Done filerep_sender panic fault")
            
        tinctest.logger.info("[STLRTest] Done Injecting Fault")

    def resume_faults(self,fault_name,trans_state):
        '''
        @summary : Resume the fault and check status
        '''
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_faults")   

        if not trans_state == 'failover_to_mirror' :
            tinctest.logger.info("[STLRTest] fault for %s resume" % fault_name)
            (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'resume', r = 'primary', H ='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault resume failed")   
            tinctest.logger.info("[STLRTest]Done fault for %s resume" % fault_name)

        if trans_state == 'postmaster_reset':
            (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'resume', r = 'mirror', H ='ALL')
            if not ok1:
                tinctest.logger.info("[STLRTest]Failed fault for %s resume on mirror" % fault_name)

        if trans_state == 'failover_to_primary' :
            self.check_fault_status(fault_name,'completed')

    def checkPSQLRun(self, test):
        '''Check if the psql run started in parallel is over before running the _post.sql '''
        tinctest.logger.info("[STLRTest] Running checkPSQLRun")   
        cmd_str = 'ps -ef|grep sub_transaction_limit_removal|grep psql'
        while(1):
            is_running = 0 
            (rc , out) = shell.run(cmd_str)
            for line in out:
                if '%s' %test in line:
                    is_running = 1 
            if is_running == 0:
                return True
            else:
                sleep(10)
        return False
        

    def resume_filerep_resync(self):
        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running resume_filerep_resync")   

        tinctest.logger.info("[STLRTest] fault for failover_to_mirror resume")
        (ok1,out1) = self.util.inject_fault(f='filerep_resync', m = 'async', y = 'resume', r = 'primary', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")   
        tinctest.logger.info("[STLRTest]Done fault for failover_to_mirror resume")
        sleep(10)

    def stop_start_validate(self, expect_down_segments=False):
        """
        Do gpstop -i, gpstart and see if all segments come back up fine 
        """        
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running stop_start_validate")   

        tinctest.logger.info("[STLRTest]Shutting down the cluster")
        ok = self.gpstop.run_gpstop_cmd(immediate = 'i')
        if not expect_down_segments:
            if not ok:
                raise Exception('[STLRTest]Problem while shutting down the cluster')
            tinctest.logger.info("[STLRTest]Successfully shutdown the cluster.")

        tinctest.logger.info("[STLRTest]Restarting the cluster.")
        ok = self.gpstart.run_gpstart_cmd()

        if not ok:
            raise Exception('[STLRTest]Failed to bring the cluster back up')
        tinctest.logger.info("[STLRTest]Successfully restarted the cluster.")
        if not self.anydownsegments():
           raise Exception("[STLRTest]segments were marked down")
        else:
           return (True, "All segments are up")

    def run_gprecoverseg(self,recover_option):
        '''
        @summary : Call gprecoverseg full or incremental to bring the cluster back in sync
        '''
        self.gpr = GpRecover()

        tinctest.logger.info("[STLRTest] Running run_gprecoverseg")   

        if recover_option == 'full':
            self.gpr.full()
        else:
            self.gpr.incremental()

        self.gpr.wait_till_insync_transition()
        
    def run_restart_database(self):
        '''
        @summary : Restart the database
        '''
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        tinctest.logger.info("[STLRTest] Running run_restart_database")   
        ok = self.gpstop.run_gpstop_cmd(immediate = 'i')
        tinctest.logger.info(ok)
        ok = self.gpstart.run_gpstart_cmd()
        tinctest.logger.info(ok)       
       
    def reset_faults(self,fault_name,current_cluster_state):
        '''
        @summary : Reset the faults at the end of test 
        '''
        self.util = Filerepe2e_Util()
        tinctest.logger.info("[STLRTest] Running reset_faults")   

        tinctest.logger.info("[STLRTest] Resetting fault before ending test")

        (ok1,out1) = self.util.inject_fault(f=fault_name, m = 'async', y = 'reset', r = 'primary', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")   
        tinctest.logger.info("[STLRTest]Done resetting %s fault" %(fault_name))

        if current_cluster_state == 'resync':
            (ok1,out1) = self.util.inject_fault(f='filerep_resync', m = 'async', y = 'reset', r = 'primary', H ='ALL')
            if not ok1:
                raise Exception("[STLRTest]Fault injection failed")   
            tinctest.logger.info("[STLRTest]Done filerep_resync fault")

        (ok1,out1) = self.util.inject_fault(f='checkpoint', m = 'async', y = 'reset', r = 'primary', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")   
        tinctest.logger.info("[STLRTest]Done resetting checkpoint fault" )
        
    def do_gpcheckcat(self, dbname=None, alldb=False, online=False, outputFile='checkcat.out', outdir=None):
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running do_gpcheckcat")
        self.dbstate.check_catalog()
        return True

    def _validation(self):
        '''
        @summary :gpcheckcat and gpcheckmirrorintegrity
        '''
        
        ###psql.run_shell_command("CHECKPOINT; CHECKPOINT; CHECKPOINT;CHECKPOINT; CHECKPOINT;")
        ###sleep(30) # sleep for some time for the segments to be in sync before validation
 
        self.dbstate = DbStateClass('run_validation')
        tinctest.logger.info("[STLRTest] Running _validation")

        outfile = local_path("subt_checkcat.out")
        self.dbstate.check_catalog(outputFile=outfile)
         
        self.dbstate.check_mirrorintegrity()

    def inject_and_resume_fault(self, fault_name, trans_state):
        self.check_fault_status(fault_name, 'triggered')
        self.filerep_fault(trans_state)
        if trans_state == 'failover_to_mirror' :
            PSQL.run_sql_file(local_path('test_while_ct.sql'))
        self.resume_faults(fault_name, trans_state)

    def run_post_sqls(self, fault_name ='', trans_state=''):
        PSQL.wait_for_database_up();
        if (trans_state == 'failover_to_primary' or trans_state == ''):   
            post_sql = "failover_sql/subt_create_table_ao_post_commit"
        else:
            post_sql = "failover_sql/subt_create_table_ao_post_abort"       
            
        sql_file = post_sql+".sql"
        ans_file = post_sql+".ans"
        out_file = post_sql+".out"

        PSQL.run_sql_file(sql_file = local_path(sql_file), out_file = local_path(out_file))
        diff_res = Gpdiff.are_files_equal(local_path(out_file), local_path(ans_file))
        
        if not diff_res:
           self.fail("[STLRTest]Gpdiff failed for : %s %s" %(fault_name, trans_state))

    def reset_all_faults(self):
        '''
        @summary : Reset all faults on primary and mirror 
        '''
        tinctest.logger.info("[STLRTest] Running reset_all_faults")   
        self.util = Filerepe2e_Util()

        (ok1,out1) = self.util.inject_fault(f='all', m = 'async', y = 'reset', r = 'primary', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")   
        tinctest.logger.info("[STLRTest]Done resetting all faults on primary")

        (ok1,out1) = self.util.inject_fault(f='all', m = 'async', y = 'reset', r = 'mirror', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")   
        tinctest.logger.info("[STLRTest]Done resetting all faults fault on mirror") 

    def kill_zombies(self):
        ''' 
        @summary : There are stray zombie processes running after each test. This method clears them 
        '''
        tinctest.logger.info("[STLRTest] Running kill_zombies")
        cmd_str = "ps -ef | grep \"port\" | awk '{print $3 \"#\" $2}' | grep -w 1"
        cmd = Command("shell_command", cmd_str)
        tinctest.logger.info('Executing command: %s : %s' %("shell command", cmd_str))
        cmd.run()
        result = cmd.get_results()
        out = result.stdout
        lines = out.split('\n')
        for line in lines:
            pids = line.split('#')
            if pids[0] == '1':
               kill_str= "kill -9 %s" %(pids[1])
               cmd2 = Command("kill_command", kill_str)
               cmd2.run()


    def skip_checkpoint(self):
        ''' 
        @summary : Routine to inject fault that skips checkpointing 
        '''

        self.util = Filerepe2e_Util()

        tinctest.logger.info("[STLRTest] Running skip_checkpoint")

        (ok1,out1) = self.util.inject_fault(f='checkpoint', m = 'async', y = 'reset', r = 'primary', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done resetting the checkpoint fault")

        (ok1,out1) = self.util.inject_fault(f='checkpoint', m = 'async', y = 'skip', r = 'primary', H ='ALL')

        if not ok1:
            raise Exception("[STLRTest]Fault injection failed")
        tinctest.logger.info("[STLRTest]Done skipping the checkpoint fault")

    def method_setup(self):
        tinctest.logger.info("Performing setup tasks")
        gpfs=Gpfilespace()
        gpfs.create_filespace('subt_filespace_a')

    def cleandb(self):
        db = Database()
        db.setupDatabase('gptest')
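
A hypothetical end-to-end flow for the SubTransactionLimitRemovalTestCase helpers above; the fault name, the sql file name, and running the workload in a thread are illustrative assumptions, not taken from the original suite.

import threading

def subtransaction_failover_scenario(case):
    # `case` is an instance of SubTransactionLimitRemovalTestCase
    case.check_system()
    case.skip_checkpoint()
    case.suspend_faults('subtransaction_release')
    worker = threading.Thread(target=case.run_sqls,
                              args=('sub_transaction_limit_removal.sql',))
    worker.start()
    case.inject_and_resume_fault('subtransaction_release', 'failover_to_primary')
    worker.join()
    case.run_gprecoverseg('incremental')
    case.run_post_sqls('subtransaction_release', 'failover_to_primary')
    case._validation()
    case.reset_all_faults()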
Example #28
0
class gpStartTestCase(unittest.TestCase):
    def setUp(self):
        self.basedir = os.path.dirname(__file__)
        self.gphome = os.environ.get('GPHOME')
        self.gp = GpStart()
        self.gps = GpStop()
        self.MAX_TRY = 3
        self.TIMEOUT = 90
        self.MAXPARALLELSEG = 60

    def test_gpstart_logDir(self):
        tinctest.logger.info("Running test_gpstart_logDir")
        self.logdir = ''.join([self.basedir, '/logs'])
        cmd = Command(name='Remove gpstop<nnnn>.log', cmdStr='rm -f %s/gpstop*' % (self.logdir))
        tinctest.logger.info("Removing gpstop<nnnn>.log : %s" % cmd)
        cmd.run(validateAfter=True)
        result = cmd.get_results()
        if result.rc != 0 or result.stderr:
            raise gpstopException("Not able to delete existing gpstop<nnnn>.log")
        lcmd = ' '.join(['ls', self.logdir, '| wc -l'])
        res = False
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(logdir=self.logdir)
        if res is not True:
            raise GPstopError("Error : run_gpstart_cmd(logdir) failed \n")
        cmd = Command(name='count of gpstart<nnnn>.log', cmdStr=' %s ' % (lcmd))
        tinctest.logger.info("Count gpstart<nnnn>.log : %s" % cmd)
        cmd.run(validateAfter=True)
        result = cmd.get_results()
        if result.rc != 0 or result.stderr:
            raise gpstopException("Not able to get count of gpstart<nnnn>.log")
        assert int(result.stdout) > 0

    def test_gpstart_getversion(self):
        res = self.gp.get_version()
        self.assertTrue(res)

    def test_gpstart_restrict(self):
        tinctest.logger.info("Running test_gpstart_restrict")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(restrict='y')
        self.assertTrue(res)

    def test_gpstart_timeout(self):
        tinctest.logger.info("Running test_gpstart_timeout")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(timeout=self.TIMEOUT)
        self.assertTrue(res)

    def test_gpstart_parallelproc(self):
        tinctest.logger.info("Running test_gpstart_parallelproc")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(parallelproc=self.MAXPARALLELSEG)
        self.assertTrue(res)

    def test_gpstart_noprompt(self):
        tinctest.logger.info("Running test_gpstart_noprompt")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd()
        self.assertTrue(res)

    def test_gpstart_cmd_masterOnly(self):
        tinctest.logger.info("Running test_gpstart_cmd_masterOnly")
        if self.is_not_running_gpdb():
            self.gp.run_gpstart_cmd(masteronly='y')
        res = self.gpstartCheck()
        self.assertTrue(res)

    def test_gpstart_cmd_quiet(self):
        tinctest.logger.info("Running test_gpstart_cmd_quiet")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(quietmode='y')
        self.assertTrue(res)

    def test_gpstart_cmd_startcluster(self):
        tinctest.logger.info("Running test_gpstart_cmd_startcluster")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd()
        self.assertTrue(res)

    def test_gpstart_cmd_verbose(self):
        tinctest.logger.info("Running test_gpstart_cmd_verbose")
        if self.is_not_running_gpdb():
            res = self.gp.run_gpstart_cmd(verbose='y')
        self.assertTrue(res)

    def test_gpstart_check(self):
        if not self.gpstartCheck():
            res2 = self.gp.run_gpstart_cmd()
            res = self.gpstartCheck()
            self.assertTrue(res)

    def test_func_gpstart_quiet(self):
        if self.is_not_running_gpdb():
            res = self.gp.gpstart_quiet()
        self.assertTrue(res)

    def is_not_running_gpdb(self):
        res = False
        ctr = 0
        while ctr < self.MAX_TRY:
            ctr = ctr + 1
            res = self.gpstartCheck()
            if res is True:
                self.gps.run_gpstop_cmd(quietmode='y')
            else:
                return True
        if (res is True and ctr < self.MAX_TRY):
            return True
        else:
            return False

    def gpstartCheck(self):
        """
        Checks if the cluster is brought up correctly and all segments are in sync
        """
        bashCmd = 'source ' + (self.gphome) + '/greenplum_path.sh;' + (self.gphome) + '/bin/pg_ctl status -D $MASTER_DATA_DIRECTORY | grep \'pg_ctl: server is running\''
        dbStart = Command(name='gpstartCheck ', cmdStr=bashCmd)
        dbStart.run()
        rc = dbStart.get_results().rc
        if rc != 0:
            return False
        return True
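
For reference, a minimal sketch of running the gpStartTestCase above with the standard unittest runner; picking these two tests for the suite is only an example.

import unittest

if __name__ == '__main__':
    suite = unittest.TestSuite()
    suite.addTest(gpStartTestCase('test_gpstart_noprompt'))
    suite.addTest(gpStartTestCase('test_gpstart_timeout'))
    unittest.TextTestRunner(verbosity=2).run(suite)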
Example #29
0
class FtsTransitions(MPPTestCase):

    def __init__(self, methodName):
        self.pgport = os.environ.get('PGPORT')
        self.fileutil = Filerepe2e_Util()
        self.gpstate = Gpstate()
        self.gpprimarymirror = Gpprimarymirror()
        self.gpstart = GpStart()
        self.gpstop = GpStop()
        super(FtsTransitions,self).__init__(methodName)

    def kill_first_mirror(self):
        mirror_data_loc = self.get_default_fs_loc(role='m',content=0)
        gpconfig = GPDBConfig()
        (host, port) = gpconfig.get_hostandport_of_segment(psegmentNumber = 0, pRole = 'm')    
        cmdString = 'ps -ef|grep -v grep|grep \'%s\'|awk \'{print $2}\'|xargs kill -9'%mirror_data_loc
        remote = Command(name ='kill first mirror', cmdStr = cmdString, ctxt=2, remoteHost=host)
        remote.run() 
        tinctest.logger.info('run command %s'%cmdString)
        rc = remote.get_results().rc    
        result = remote.get_results().stdout
        tinctest.logger.info('Command returning, rc: %s, result: %s'%(rc,result))

    def kill_master_process(self, ProcName=None):
        cmdString = 'ps -ef|grep postgres| grep %s | grep \'%s\'| awk \'{print $2}\'|xargs kill -9'%(self.pgport,ProcName) 
        cmd = Command('kill process on master', cmdStr = cmdString)
        cmd.run()
        tinctest.logger.info('run command %s'%cmdString)
        rc = cmd.get_results().rc    
        result = cmd.get_results().stdout
        tinctest.logger.info('Command returning, rc: %s, result: %s'%(rc,result))


    def get_default_fs_loc(self, role='m', content=0):
        fs_sql = '''select fselocation from pg_filespace_entry
                    where fsefsoid = 3052 and fsedbid = (select dbid from gp_segment_configuration
                    where role = \'%s\' and content = %s);'''%(role,content)
        result = PSQL.run_sql_command(fs_sql, flags = '-q -t', dbname= 'template1')
        result = result.strip()
        filespace_loc = result.split('\n')
        return filespace_loc[0]
  
    def set_faults(self,fault_name, type, role='mirror', port=None, occurence=None, sleeptime=None, seg_id=None):
        ''' Reset the fault and then issue the fault with the given type'''
        self.fileutil.inject_fault(f=fault_name, y=type, r=role, p=port , o=occurence, sleeptime=sleeptime, seg_id=seg_id)

    def resume_faults(self,fault_name, role='mirror'):
        ''' Resume the fault issues '''
        self.fileutil.inject_fault(f=fault_name, y='resume', r=role)

    def incremental_recoverseg(self, workerPool=False):
        gprecover = GpRecover(GPDBConfig())
        gprecover.incremental(workerPool)

    def run_recoverseg_if_ct(self):
        gpconfig = GPDBConfig()
        num_down = gpconfig.count_of_nodes_in_mode('c')
        if (int(num_down) > 0):
            self.incremental_recoverseg()

    def wait_till_change_tracking(self):
        self.fileutil.wait_till_change_tracking_transition()

    def wait_till_insync(self):
        gprecover = GpRecover(GPDBConfig())
        gprecover.wait_till_insync_transition()

    def run_gpstate(self, type, phase):
        self.gpstate.run_gpstate(type, phase)

    def run_gpprimarymirror(self):
        self.gpprimarymirror.run_gpprimarymirror()

    def verify_gpprimarymirror_output(self, total_resync=0, cur_resync=0):
        status = self.gpprimarymirror.verify_gpprimarymirror_output(total_resync, cur_resync)
        self.assertTrue(status, 'Total and Cur resync object count mismatch')

    def run_gpstate_shell_cmd(self, options):
        self.gpstate.run_gpstate_shell_cmd(options)

    def verify_gpstate_output(self):
        status = self.gpstate.verify_gpstate_output()
        self.assertTrue(status, 'Total and Cur resync object count mismatch')

    def run_trigger_sql(self, wait_for_db=True):
        ''' Run a sql statement to trigger postmaster reset '''
        PSQL.run_sql_file(local_path('test_ddl.sql'))
        if wait_for_db:
            PSQL.wait_for_database_up()

    def run_fts_test_ddl_dml(self):
        PSQL.run_sql_file(local_path('fts_test_ddl_dml.sql'))

    def run_fts_test_ddl_dml_before_ct(self):
        PSQL.run_sql_file(local_path('fts_test_ddl_dml_before_ct.sql'))

    def run_fts_test_ddl_dml_ct(self):
        PSQL.run_sql_file(local_path('fts_test_ddl_dml_ct.sql'))

    def restart_db(self):
        self.gpstop.run_gpstop_cmd(immediate = True)
        self.gpstart.run_gpstart_cmd()

    def stop_db_with_no_rc_check(self):
        ''' Gpstop and don't check the return code '''
        cmd = Command('Gpstop_a', 'gpstop -a')
        tinctest.logger.info('Executing command: gpstop -a')
        cmd.run()

    def start_db_with_no_rc_check(self):
        ''' Gpstart and don't check the return code '''
        cmd = Command('Gpstart_a', 'gpstart -a')
        tinctest.logger.info('Executing command: gpstart -a')
        cmd.run()

    def restart_db_with_no_rc_check(self):
        self.stop_db_with_no_rc_check()
        self.start_db_with_no_rc_check()

    def check_fault_status(self, fault_name, seg_id=None, role=None):
        status = self.fileutil.check_fault_status(fault_name = fault_name, status ='triggered', max_cycle=20, role=role, seg_id=seg_id)
        self.assertTrue(status, 'The fault is not triggered in the time expected')

    def cluster_state(self):
        gpconfig = GPDBConfig()
        state = gpconfig.is_not_insync_segments()
        self.assertTrue(state,'The cluster is not up and in sync')
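
A hypothetical sketch of chaining the FtsTransitions helpers above in a mirror-failure scenario; the phase string passed to run_gpstate and the overall ordering are assumptions, not part of the original suite.

def mirror_down_and_recover(fts):
    # `fts` is an instance of FtsTransitions
    fts.kill_first_mirror()
    fts.wait_till_change_tracking()
    fts.run_gpstate('primary', 'ct')
    fts.run_recoverseg_if_ct()
    fts.wait_till_insync()
    fts.cluster_state()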
Example #30
0
class GPDBStorageBaseTestCase():
    '''
    Base Class for Storage test-suits like Crash Recovery, 
    Pg_Two_Phase, sub_transaction
    '''
    def __init__(self, config=None):
        if config is not None:
            self.config = config
        else:
            self.config = GPDBConfig()

        self.filereputil = Filerepe2e_Util()
        self.gprecover = GpRecover(self.config)
        self.gpstop = GpStop()
        self.gpstart = GpStart()
        self.gpfile = Gpfilespace(self.config)
        self.gpverify = GpdbVerify(config=self.config)
        self.dbstate = DbStateClass('run_validation', self.config)
        self.port = os.getenv('PGPORT')

    def invoke_fault(self,
                     fault_name,
                     type,
                     role='mirror',
                     port=None,
                     occurence=None,
                     sleeptime=None,
                     seg_id=None):
        ''' Reset the fault and then issue the fault with the given type'''
        self.filereputil.inject_fault(f=fault_name,
                                      y='reset',
                                      r=role,
                                      p=port,
                                      o=occurence,
                                      sleeptime=sleeptime,
                                      seg_id=seg_id)
        self.filereputil.inject_fault(f=fault_name,
                                      y=type,
                                      r=role,
                                      p=port,
                                      o=occurence,
                                      sleeptime=sleeptime,
                                      seg_id=seg_id)
        tinctest.logger.info(
            'Successfully injected fault_name : %s fault_type : %s  occurence : %s '
            % (fault_name, type, occurence))

    def start_db(self):
        '''Gpstart '''
        rc = self.gpstart.run_gpstart_cmd()
        if not rc:
            raise Exception('Failed to start the cluster')
        tinctest.logger.info('Started the cluster successfully')

    def stop_db(self):
        ''' Gpstop and don't check the return code '''
        cmd = Command('Gpstop_a', 'gpstop -a')
        tinctest.logger.info('Executing command: gpstop -a')
        cmd.run()

    def get_trigger_status(self, trigger_count, max_cnt=50):
        '''Compare the pg_stat_activity count with the total number of trigger_sqls executed '''
        psql_count = 0
        for i in range(1, trigger_count):
            psql_count = PSQL.run_sql_command(
                'select count(*) from pg_stat_activity;',
                flags='-q -t',
                dbname='postgres')
            sleep(1)
        tinctest.logger.info('Count of trigger sqls %s And it should be %s' %
                             (psql_count, trigger_count))
        if psql_count < trigger_count:
            tinctest.logger.info('coming to the if loop in get_trigger_status')
            return False
        return True

    def check_trigger_sql_hang(self, test_dir):
        '''
        @param test_dir : directory containing the trigger sqls
        @description : Return the status of the trigger sqls: whether they are waiting on the fault 
        Since gpfaultinjector has no way to check if all the sqls are triggered, we are using 
        a count(*) on pg_stat_activity and compare the total number of trigger_sqls
        '''
        trigger_dir = local_path('%s_tests/trigger_sql/' % (test_dir))
        trigger_count = len(glob.glob1(trigger_dir, "*.ans"))
        return self.get_trigger_status(trigger_count)

    def get_items_list(self, test_file):
        ''' Get file contents to a list '''
        with open(test_file, 'r') as f:
            test_list = [line.strip() for line in f]
        return test_list

    def validate_sql(self, filename):
        ''' Compare the out and ans files '''
        out_file = local_path(filename.replace(".sql", ".out"))
        ans_file = local_path(filename.replace('.sql', '.ans'))
        assert Gpdiff.are_files_equal(out_file, ans_file)

    def run_sql(self, filename, verify=True):
        ''' Run the provided sql and validate it '''
        out_file = local_path(filename.replace(".sql", ".out"))
        PSQL.run_sql_file(sql_file=filename, out_file=out_file)
        if verify == True:
            self.validate_sql(filename)
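
Finally, a minimal hypothetical sketch of driving GPDBStorageBaseTestCase from a storage test; the fault arguments mirror the 'checkpoint'/'skip' usage shown earlier in this file, but the function itself is illustrative only.

def crash_recovery_smoke():
    base = GPDBStorageBaseTestCase()
    base.invoke_fault('checkpoint', 'skip', role='primary',
                      port=base.port, occurence='0')
    base.stop_db()
    base.start_db()
    if not base.gprecover.wait_till_insync_transition():
        raise Exception('Segments not in sync after restart')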