Ejemplo n.º 1
0
def update_on_segments(update_cmds, batch_size):
    """Run the pg_hba.conf update commands in parallel and verify them.

    Args:
        update_cmds: commands to execute; each one must provide
            get_results() and was_successful().
        batch_size: upper bound on the number of worker threads.

    Raises:
        Exception: if waiting for the pool failed, or if any completed
            command reports that it was not successful.
    """
    if not update_cmds:
        # Nothing to run. Returning early also avoids asking for a pool
        # with zero workers, which min() below would otherwise produce.
        return

    num_workers = min(batch_size, len(update_cmds))
    pool = WorkerPool(num_workers)
    failure = False
    try:
        for uc in update_cmds:
            pool.addCommand(uc)
        try:
            pool.join()
        except Exception as e:
            # An interrupted join means some commands may never have run,
            # so the overall update cannot be reported as successful.
            logger.error("Error while waiting for pg_hba.conf updates: " +
                         str(e))
            failure = True
            pool.haltWork()
            pool.joinWorkers()

        for cmd in pool.getCompletedItems():
            r = cmd.get_results()
            if not cmd.was_successful():
                logger.error("Unable to update pg_hba conf on primary segment: " +
                             str(r))
                failure = True
    finally:
        # Always stop the worker threads, even if inspecting results raised.
        pool.haltWork()
        pool.joinWorkers()

    if failure:
        logger.error("Unable to update pg_hba.conf on the primary segments")
        raise Exception(
            "Unable to update pg_hba.conf on the primary segments.")
Ejemplo n.º 2
0
def restore_pg_hba_on_segment(gparr):
    """
    Restore pg_hba.conf on every segment of the array.

    Master and standby entries are skipped. The remaining data
    directories are grouped per host and one remote initstandby.py
    invocation is issued for each host. Failures are logged, not raised.
    """
    logger.debug('Restoring pg_hba.conf file on segments...')

    data_dirs_by_host = defaultdict(list)
    for segment in gparr.getDbList():
        if segment.isSegmentMaster() or segment.isSegmentStandby():
            continue
        data_dirs_by_host[segment.getSegmentHostName()].append(segment.getSegmentDataDirectory())

    pool = WorkerPool(numWorkers=DEFAULT_BATCH_SIZE)

    try:
        for host, data_dirs_list in data_dirs_by_host.items():
            # The directory list travels as a base64-encoded pickle argument.
            encoded_dirs = base64.urlsafe_b64encode(pickle.dumps(data_dirs_list))
            restore_cmd = Command(
                'Restore the pg_hba.conf on remote hosts',
                cmdStr="$GPHOME/lib/python/gppylib/operations/initstandby.py -d %s -r" % encoded_dirs,
                ctxt=REMOTE,
                remoteHost=host)
            pool.addCommand(restore_cmd)

        pool.join()

        for finished in pool.getCompletedItems():
            outcome = finished.get_results()
            if outcome.rc == 0:
                continue
            logger.error('Unable to restore pg_hba.conf %s' % str(outcome.stderr))
            logger.error('Please check the segment for more details')

    finally:
        # Worker threads must be stopped whether or not the restore worked.
        pool.haltWork()
        pool.joinWorkers()
        pool = None
Ejemplo n.º 3
0
    def execute(self):
        """Remove the shared memory segment recorded for each segment.

        For every entry in self.segments the last whitespace-separated
        field on the last line of <datadir>/postmaster.pid is passed to
        `ipcrm -m`. Segments without a pid file, or with an empty one,
        are skipped.

        Raises:
            Exception: if an ipcrm command fails for a reason other than
                an "ipcrm: invalid id" error.
        """
        pool = WorkerPool()
        try:
            for seg in self.segments:
                datadir = seg.getSegmentDataDirectory()
                postmaster_pid_file = '%s/postmaster.pid' % datadir
                shared_mem = None
                if os.path.isfile(postmaster_pid_file):
                    with open(postmaster_pid_file) as fp:
                        lines = fp.readlines()
                    # An empty pid file carries no id to clean up.
                    fields = lines[-1].split() if lines else []
                    if fields:
                        shared_mem = fields[-1]
                if shared_mem:
                    cmd = Command('clean up shared memory', cmdStr="ipcrm -m %s" % shared_mem)
                    pool.addCommand(cmd)

            # Wait once, after everything is queued, so the commands can
            # run concurrently instead of one segment at a time.
            pool.join()

            for item in pool.getCompletedItems():
                result = item.get_results()

                # This code is usually called after a GPDB segment has
                # been terminated.  In that case, it is possible that
                # the shared memory has already been freed by the
                # time we are called to clean up.  Due to this race
                # condition, it is possible to get an `ipcrm: invalid
                # id1` error from ipcrm.  We, therefore, ignore it.
                if result.rc != 0 and not result.stderr.startswith("ipcrm: invalid id"):
                    raise Exception('Unable to clean up shared memory for segment: (%s)' % (result.stderr))
        finally:
            pool.haltWork()
            pool.joinWorkers()
            pool = None
Ejemplo n.º 4
0
class ConcurrentFilespaceMoveTestCase(unittest.TestCase):
    """ Exercises gpfilespace when several instances try to move a
        filespace at the same time.
        Exactly one of the racing processes is expected to succeed; every
        other one should report that another instance is already running."""

    ALREADY_RUNNING_MSG = 'Another instance of gpfilespace is already running!'

    def setUp(self):
        self.pool = None
        self.pool = WorkerPool()

    def tearDown(self):
        if not self.pool:
            return
        self.pool.haltWork()
        self.pool.joinWorkers()
        self.pool.join()

    def get_move_filespace_cmd(self, filespace='myfspc', file_type=FileType.TEMPORARY_FILES):
        # Translate the FileType constant into the gpfilespace option name;
        # any other value is passed through unchanged.
        if file_type == FileType.TEMPORARY_FILES:
            option = 'movetempfiles'
        elif file_type == FileType.TRANSACTION_FILES:
            option = 'movetransfiles'
        else:
            option = file_type

        cmd_str = 'gpfilespace --%s %s' % (option, filespace)
        return Command(name='move filespace', cmdStr=cmd_str)

    def run_concurrently(self, cmd_list):
        # Queue everything first, then block until the pool has drained.
        for queued in cmd_list:
            self.pool.addCommand(queued)
        self.pool.join()

    def check_concurrent_execution_result(self, execution_results):
        # A command counts as a winner when its stdout does not carry the
        # "already running" message.
        winners = sum(
            1 for finished in execution_results
            if self.ALREADY_RUNNING_MSG not in finished.get_results().stdout.strip())

        self.assertEqual(winners, 1)

    def test00_move_temp_filespace(self):
        cmd_list = [self.get_move_filespace_cmd(file_type=FileType.TEMPORARY_FILES),
                    self.get_move_filespace_cmd(file_type=FileType.TEMPORARY_FILES)]
        self.run_concurrently(cmd_list)
        self.check_concurrent_execution_result(self.pool.getCompletedItems())

    def test01_move_trans_filespace(self):
        cmd_list = [self.get_move_filespace_cmd(file_type=FileType.TRANSACTION_FILES),
                    self.get_move_filespace_cmd(file_type=FileType.TRANSACTION_FILES)]
        self.run_concurrently(cmd_list)
        self.check_concurrent_execution_result(self.pool.getCompletedItems())

    def test02_move_temp_and_trans_filespace(self):
        cmd_list = [
            self.get_move_filespace_cmd(file_type=kind)
            for kind in (FileType.TEMPORARY_FILES, FileType.TRANSACTION_FILES)
        ]
        self.run_concurrently(cmd_list)
        self.check_concurrent_execution_result(self.pool.getCompletedItems())
Ejemplo n.º 5
0
    def validate_nic_down(self):
        """
            Ping every address recorded in nic_to_address_map.

            Returns False as soon as one ping exits with rc 0,
            otherwise logs success and returns True.
        """
        pool = WorkerPool()

        try:
            for key in self.nic_to_address_map:
                nic, hostname = key
                target = self.nic_to_address_map[(nic, hostname)]
                pool.addCommand(Ping('ping validation', target, ctxt=REMOTE, remoteHost='localhost'))
            pool.join()

            # A ping that succeeds means the nic is still up.
            if any(done.get_results().rc == 0 for done in pool.getCompletedItems()):
                return False
        finally:
            pool.haltWork()
            pool.joinWorkers()
            pool.join()

        tinctest.logger.info("Successfully brought down nics ...")
        return True
Ejemplo n.º 6
0
    def execute(self):
        """Free the shared memory segment left behind by each segment.

        The id handed to `ipcrm -m` is the last whitespace-separated
        field on the last line of <datadir>/postmaster.pid. A missing or
        empty pid file means there is nothing to clean for that segment.

        Raises:
            Exception: when ipcrm fails with anything other than an
                "ipcrm: invalid id" error.
        """
        pool = WorkerPool()
        try:
            for seg in self.segments:
                shared_mem = None
                postmaster_pid_file = '%s/postmaster.pid' % (
                    seg.getSegmentDataDirectory())
                if os.path.isfile(postmaster_pid_file):
                    with open(postmaster_pid_file) as fp:
                        pid_lines = fp.readlines()
                    # Guard against an empty file instead of failing
                    # with IndexError on pid_lines[-1].
                    last_fields = pid_lines[-1].split() if pid_lines else []
                    if last_fields:
                        shared_mem = last_fields[-1]
                if shared_mem:
                    cmd = Command('clean up shared memory',
                                  cmdStr="ipcrm -m %s" % shared_mem)
                    pool.addCommand(cmd)

            # Join after the loop: joining per segment would make the
            # pool execute the commands strictly one at a time.
            pool.join()

            for item in pool.getCompletedItems():
                result = item.get_results()

                # This code is usually called after a GPDB segment has
                # been terminated.  In that case, it is possible that
                # the shared memory has already been freed by the
                # time we are called to clean up.  Due to this race
                # condition, it is possible to get an `ipcrm: invalid
                # id1` error from ipcrm.  We, therefore, ignore it.
                if result.rc != 0 and not result.stderr.startswith(
                        "ipcrm: invalid id"):
                    raise Exception(
                        'Unable to clean up shared memory for segment: (%s)' %
                        (result.stderr))
        finally:
            pool.haltWork()
            pool.joinWorkers()
            pool = None
class GpMirrorListToBuildTestCase(GpTestCase):
    """Checks that run_pg_rewind() reports the segments whose rewind failed."""

    def setUp(self):
        self.pool = WorkerPool()

    def tearDown(self):
        # All background threads must be stopped, or else the test runner will
        # hang waiting. Join the stopped threads to make sure we're completely
        # clean for the next test.
        self.pool.haltWork()
        self.pool.joinWorkers()
        super(GpMirrorListToBuildTestCase, self).tearDown()

    def test_pg_rewind_parallel_execution(self):
        self.apply_patches([
            # Mock CHECKPOINT command in run_pg_rewind() as successful
            patch('gppylib.db.dbconn.connect', return_value=Mock()),
            patch('gppylib.db.dbconn.execSQL', return_value=Mock()),
            # Mock the command to remove postmaster.pid as successful
            patch('gppylib.commands.base.Command.run', return_value=Mock()),
            patch('gppylib.commands.base.Command.get_return_code', return_value=0),
            # Mock all pg_rewind commands to be not successful
            patch('gppylib.commands.base.Command.was_successful', return_value=False),
            patch('gppylib.commands.base.Command.get_stdout', return_value='Mocking results')
        ])
        from gppylib.operations.buildMirrorSegments import GpMirrorListToBuild
        # WorkerPool is the only valid parameter required in this test
        # case.  The test expects the workers to get a pg_rewind
        # command to run (and the command should fail to run).
        g = GpMirrorListToBuild(1, self.pool, 1,1)
        p0 = Segment.initFromString("2|0|p|p|s|u|sdw1|sdw1|40000|/data/primary0")
        p1 = Segment.initFromString("3|1|p|p|s|u|sdw2|sdw2|40001|/data/primary1")
        m0 = Segment.initFromString("4|0|m|m|s|u|sdw2|sdw2|50000|/data/mirror0")
        m1 = Segment.initFromString("5|1|m|m|s|u|sdw1|sdw1|50001|/data/mirror1")
        # One rewind entry per segment, keyed by dbid.
        rewindInfo = {}
        for seg in (p0, p1, m0, m1):
            rewindInfo[seg.dbid] = GpMirrorListToBuild.RewindSegmentInfo(
                seg, seg.address, seg.port)

        # Test1: all 4 pg_rewind commands should fail due the "was_successful" patch
        failedSegments = g.run_pg_rewind(rewindInfo)
        self.assertEqual(len(failedSegments), 4)
        # The returned list of failed segments should contain items of
        # type gparray.Segment
        failedSegments.remove(p0)
        self.assertGreater(failedSegments[0].getSegmentDbId(), 0)

        # Test2: patch it such that no failures this time. The context
        # manager undoes this override when the block exits, so the patcher
        # is not left active after the test.
        with patch('gppylib.commands.base.Command.was_successful', return_value=True):
            failedSegments = g.run_pg_rewind(rewindInfo)
        self.assertEqual(len(failedSegments), 0)
Ejemplo n.º 8
0
def update_pg_hba_conf_on_segments(gparr,
                                   standby_host,
                                   is_hba_hostnames=False,
                                   unreachable_hosts=[]):
    """
    Push the standby's pg_hba.conf entries to every segment in the array.

    Coordinator and standby entries are skipped. Hosts listed in
    unreachable_hosts are not contacted; a warning asks for a manual
    update instead. Remote failures are logged, not raised.
    """
    logger.debug('Updating pg_hba.conf file on segments...')
    json_standby_pg_hba_info = json.dumps(
        get_standby_pg_hba_info(standby_host, is_hba_hostnames))

    # Group the segment data directories by the host they live on.
    data_dirs_by_host = defaultdict(list)
    for segment in gparr.getDbList():
        if segment.isSegmentCoordinator() or segment.isSegmentStandby():
            continue
        data_dirs_by_host[segment.getSegmentHostName()].append(
            segment.getSegmentDataDirectory())

    pool = WorkerPool(numWorkers=DEFAULT_BATCH_SIZE)

    try:
        for host, data_dirs_list in data_dirs_by_host.items():
            if host in unreachable_hosts:
                logger.warning(
                    "Manual update of the pg_hba_conf files for all segments on unreachable host %s will be required."
                    % host)
                continue
            remote_args = (json_standby_pg_hba_info,
                           json.dumps(data_dirs_list))
            update_cmd = Command(
                'Update the pg_hba.conf on remote hosts',
                cmdStr="$GPHOME/lib/python/gppylib/operations/initstandby.py -p '%s' -d '%s'" % remote_args,
                ctxt=REMOTE,
                remoteHost=host)
            pool.addCommand(update_cmd)

        pool.join()

        for finished in pool.getCompletedItems():
            outcome = finished.get_results()
            if outcome.rc == 0:
                continue
            logger.error('Unable to update pg_hba.conf %s' %
                         str(outcome.stderr))
            logger.error(
                'Please check the segment log file for more details')

    finally:
        # Stop the worker threads regardless of how the update went.
        pool.haltWork()
        pool.joinWorkers()
        pool = None
Ejemplo n.º 9
0
 def _get_pgcontrol_data_from_segments(self, gpdb_list):
     """Run pg_controldata remotely for each entry of gpdb_list.

     Returns the pool's completed commands; each command carries the
     entry it was built for in its gparray_gpdb attribute.
     """
     worker_pool = WorkerPool(numWorkers=self.workers)
     try:
         for segment in gpdb_list:
             controldata_cmd = PgControlData(name='run pg_controldata', datadir=segment.getSegmentDataDirectory(),
                                             ctxt=REMOTE, remoteHost=segment.getSegmentHostName())
             # Tag the command so callers can map a result back to its segment.
             controldata_cmd.gparray_gpdb = segment
             worker_pool.addCommand(controldata_cmd)
         worker_pool.join()
     finally:
         # Workers left running would keep the process from exiting.
         worker_pool.haltWork()
         worker_pool.joinWorkers()
     return worker_pool.getCompletedItems()
Ejemplo n.º 10
0
 def _get_pgcontrol_data_from_segments(self, gpdb_list):
     """Collect pg_controldata results for every entry of gpdb_list.

     One remote PgControlData command is queued per entry, with the
     entry itself stored on the command as gparray_gpdb. The completed
     commands are returned once the pool has finished.
     """
     pool = WorkerPool(numWorkers=self.workers)
     try:
         for entry in gpdb_list:
             data_dir = entry.getSegmentDataDirectory()
             host_name = entry.getSegmentHostName()
             command = PgControlData(name='run pg_controldata',
                                     datadir=data_dir,
                                     ctxt=REMOTE,
                                     remoteHost=host_name)
             command.gparray_gpdb = entry
             pool.addCommand(command)
         pool.join()
     finally:
         # Halt and join the workers in every case, otherwise we hang.
         pool.haltWork()
         pool.joinWorkers()
     return pool.getCompletedItems()
Ejemplo n.º 11
0
    def bring_down_nic(self, nics, hostname):
        """
            Take the named nics on hostname down.

            The address of every nic is looked up and stored in
            nic_to_address_map before any nic is brought down.
            Returns False when nics is None or when an ifdown command
            exits non-zero, True otherwise.
        """
        if nics is None:
            return False

        pool = WorkerPool()

        try:
            # Step 1: resolve each interface to its ip address.
            for nic in nics:
                lookup = Command(
                    name='get the ip of the interface',
                    cmdStr=
                    "/sbin/ifconfig %s | grep \'inet addr:\' | cut -d: -f2 | awk \'{ print $1}\'"
                    % nic,
                    ctxt=REMOTE,
                    remoteHost=hostname)
                lookup.run(validateAfter=True)
                lookup_results = lookup.get_results()
                if lookup_results.rc != 0:
                    raise Exception('Unable to map interface to ipaddress')

                address = lookup_results.stdout.split()[0].strip()
                self.nic_to_address_map[(nic, hostname)] = address

            # Step 2: queue one ifdown per interface and run them in the pool.
            for nic in nics:
                tinctest.logger.info("Bringing down %s:%s ..." %
                                     (hostname, nic))
                pool.addCommand(
                    Command(name='bring NIC down',
                            cmdStr='sudo /sbin/ifdown %s' % nic,
                            ctxt=REMOTE,
                            remoteHost=hostname))

            pool.join()
            if any(done.get_results().rc != 0
                   for done in pool.getCompletedItems()):
                return False
        finally:
            pool.haltWork()
            pool.joinWorkers()
            pool.join()

        return True
Ejemplo n.º 12
0
def findFsDetails():
    """Fill fsDetailsMap with the df details of each listed filesystem.

    For every host in serverFSMap, `df -P <filesystems>` is run over ssh.
    For each filesystem name the map entry becomes
    [host, first df column, sixth df column]. Hosts whose command fails,
    or whose output does not have one line per filesystem, are reported
    on stdout and skipped. Any exception is printed rather than raised.
    """
    global serverFSMap
    pool = None
    try:
        #find the mount points in parallel
        pool = WorkerPool()

        for hname in serverFSMap:
            subCmd = "df -P %s" % (serverFSMap[hname])
            cmdStr = 'ssh -o PasswordAuthentication=no %s "%s"' % (hname,
                                                                   subCmd)
            pool.addCommand(Command(hname, cmdStr, REMOTE, hname))
        pool.join()

        for item in pool.getCompletedItems():
            if item.results.rc != 0:
                print("Failure in talking to host %s" % (item.remoteHost))
                continue

            # Skip the df header line; slicing tolerates empty output.
            df_list = item.results.stdout.strip().splitlines()[1:]
            fsList = serverFSMap[item.remoteHost].split()
            if len(df_list) != len(fsList):
                print("Mismatch")
                continue
            # df lines are paired with the requested names by position.
            for fs_name, df_vals in zip(fsList, df_list):
                df_val = df_vals.split()
                fsDetailsMap[fs_name.strip()] = [
                    item.remoteHost, df_val[0], df_val[5]
                ]

    except Exception as e:
        print(str(e))
    finally:
        # pool stays None if WorkerPool() itself failed; nothing to stop then.
        if pool is not None:
            pool.join()
            pool.haltWork()
            pool.joinWorkers()
Ejemplo n.º 13
0
def cleanup_pg_hba_backup_on_segment(gparr, unreachable_hosts=[]):
    """
    Remove the pg_hba.conf backup file on every segment in the array.

    Coordinator and standby entries are skipped, as are hosts listed in
    unreachable_hosts. Remote failures are logged, not raised.
    """
    logger.debug('Removing pg_hba.conf backup file on segments...')

    # Group the segment data directories by the host they live on.
    data_dirs_by_host = defaultdict(list)
    for segment in gparr.getDbList():
        if segment.isSegmentCoordinator() or segment.isSegmentStandby():
            continue
        data_dirs_by_host[segment.getSegmentHostName()].append(
            segment.getSegmentDataDirectory())

    pool = WorkerPool(numWorkers=DEFAULT_BATCH_SIZE)

    try:
        for host, data_dirs_list in data_dirs_by_host.items():
            if host in unreachable_hosts:
                continue
            cleanup_cmd = Command(
                'Cleanup the pg_hba.conf backups on remote hosts',
                cmdStr="$GPHOME/lib/python/gppylib/operations/initstandby.py -d '%s' -D" % json.dumps(data_dirs_list),
                ctxt=REMOTE,
                remoteHost=host)
            pool.addCommand(cleanup_cmd)

        pool.join()

        for finished in pool.getCompletedItems():
            outcome = finished.get_results()
            if outcome.rc == 0:
                continue
            logger.error('Unable to cleanup pg_hba.conf backup file %s' %
                         str(outcome.stderr))
            logger.error('Please check the segment for more details')

    finally:
        # Stop the worker threads regardless of how the cleanup went.
        pool.haltWork()
        pool.joinWorkers()
        pool = None
Ejemplo n.º 14
0
def findFsDetails():
    """Record where each configured filesystem lives, per host.

    Runs `df -P <filesystems>` over ssh on every host in serverFSMap and
    stores [host, first df column, sixth df column] in fsDetailsMap under
    each filesystem name. A failed command or a line-count mismatch is
    printed and that host is skipped. Exceptions are printed, not raised.
    """
    global serverFSMap
    pool = None
    try:
        # find the mount points in parallel
        pool = WorkerPool()

        for hname in serverFSMap:
            subCmd = "df -P %s" % (serverFSMap[hname])
            cmdStr = 'ssh -o PasswordAuthentication=no %s "%s"' % (hname, subCmd)
            pool.addCommand(Command(hname, cmdStr, REMOTE, hname))
        pool.join()

        for i in pool.getCompletedItems():
            if i.results.rc == 0:
                # Everything after the header line; an empty output simply
                # yields no lines instead of raising on pop().
                df_list = i.results.stdout.strip().splitlines()[1:]
                fsList = serverFSMap[i.remoteHost].split()
                if len(df_list) != len(fsList):
                    print("Mismatch")
                    continue
                # The n-th df line is matched with the n-th requested name.
                for fs_name, df_vals in zip(fsList, df_list):
                    df_val = df_vals.split()
                    fsDetailsMap[fs_name.strip()] = [i.remoteHost, df_val[0], df_val[5]]
            else:
                print("Failure in talking to host %s" % (i.remoteHost))

    except Exception as e:
        print(str(e))
    finally:
        # One cleanup path for success and failure; pool is still None
        # when WorkerPool() itself raised.
        if pool is not None:
            pool.join()
            pool.haltWork()
            pool.joinWorkers()
Ejemplo n.º 15
0
    def execute(self):
        """Clean up the shared memory segment of each entry in self.segments.

        The id given to `ipcrm -m` is the last whitespace-separated field
        on the last line of <datadir>/postmaster.pid. Segments with no pid
        file, or an empty one, are skipped.

        Raises:
            Exception: if ipcrm fails for a reason other than an
                "ipcrm: invalid id" error.
        """
        pool = WorkerPool()
        try:
            for seg in self.segments:
                datadir = seg.getSegmentDataDirectory()
                postmaster_pid_file = '%s/postmaster.pid' % datadir
                shared_mem = None
                if os.path.isfile(postmaster_pid_file):
                    with open(postmaster_pid_file) as fp:
                        lines = fp.readlines()
                    # Nothing to clean when the pid file is empty.
                    fields = lines[-1].split() if lines else []
                    if fields:
                        shared_mem = fields[-1]
                if shared_mem:
                    cmd = Command('clean up shared memory', cmdStr="ipcrm -m %s" % shared_mem)
                    pool.addCommand(cmd)

            # Join once after queuing; joining inside the loop would run
            # the commands one segment at a time.
            pool.join()

            for item in pool.getCompletedItems():
                result = item.get_results()

                # The segment may already have released its shared memory
                # by the time ipcrm runs. ipcrm then reports an
                # "ipcrm: invalid id" error, which is not a real failure.
                if result.rc != 0 and not result.stderr.startswith("ipcrm: invalid id"):
                    raise Exception('Unable to clean up shared memory for segment: (%s)' % (result.stderr))
        finally:
            pool.haltWork()
            pool.joinWorkers()
            pool = None
Ejemplo n.º 16
0
    def bring_down_nic(self, nics, hostname):
        """
            Bring the given nics on hostname down.

            Each nic's address is first stored in nic_to_address_map.
            Returns False if nics is None or an ifdown command exits
            non-zero, True otherwise.
        """
        if nics is None:
            return False

        pool = WorkerPool()

        try:
            # Record the ip address of every interface before touching it.
            for nic in nics:
                ip_cmd = Command(name='get the ip of the interface', cmdStr="/sbin/ifconfig %s | grep \'inet addr:\' | cut -d: -f2 | awk \'{ print $1}\'" % nic, ctxt=REMOTE, remoteHost=hostname)
                ip_cmd.run(validateAfter=True)
                ip_results = ip_cmd.get_results()
                if ip_results.rc != 0:
                    raise Exception('Unable to map interface to ipaddress')

                first_token = ip_results.stdout.split()[0]
                self.nic_to_address_map[(nic, hostname)] = first_token.strip()

            # Queue the ifdown commands so the pool can run them together.
            for nic in nics:
                tinctest.logger.info("Bringing down %s:%s ..." % (hostname, nic))
                down_cmd = Command(name='bring NIC down', cmdStr='sudo /sbin/ifdown %s' % nic, ctxt=REMOTE, remoteHost=hostname)
                pool.addCommand(down_cmd)

            pool.join()
            for finished in pool.getCompletedItems():
                if finished.get_results().rc != 0:
                    return False
        finally:
            pool.haltWork()
            pool.joinWorkers()
            pool.join()

        return True
Ejemplo n.º 17
0
class WorkerPoolTest(unittest.TestCase):
    """Unit tests for WorkerPool and join_and_indicate_progress().

    Each test gets a fresh single-worker pool with a mock logger, and
    commands are mock.Mock(spec=Command) objects. Tests that need a
    command to stay "running" make cmd.run() block on a threading.Event
    and release it in a finally clause.
    """

    def setUp(self):
        self.pool = WorkerPool(numWorkers=1, logger=mock.Mock())

    def tearDown(self):
        # All background threads must be stopped, or else the test runner will
        # hang waiting. Join the stopped threads to make sure we're completely
        # clean for the next test.
        self.pool.haltWork()
        self.pool.joinWorkers()

    # Constructing a pool with zero workers is expected to raise.
    def test_pool_must_have_some_workers(self):
        with self.assertRaises(Exception):
            WorkerPool(numWorkers=0)

    # After join(), the queued command's run() has been called exactly once.
    def test_pool_runs_added_command(self):
        cmd = mock.Mock(spec=Command)

        self.pool.addCommand(cmd)
        self.pool.join()

        cmd.run.assert_called_once_with()

    def test_completed_commands_are_retrievable(self):
        cmd = mock.Mock(spec=Command)

        self.pool.addCommand(cmd) # should quickly be completed
        self.pool.join()

        self.assertEqual(self.pool.getCompletedItems(), [cmd])

    # isDone() is True for an idle pool, False while a command is still
    # running, and True again once join() has returned.
    def test_pool_is_not_marked_done_until_commands_finish(self):
        cmd = mock.Mock(spec=Command)

        # cmd.run() will block until this Event is set.
        event = threading.Event()
        def wait_for_event():
            event.wait()
        cmd.run.side_effect = wait_for_event

        self.assertTrue(self.pool.isDone())

        try:
            self.pool.addCommand(cmd)
            self.assertFalse(self.pool.isDone())

        finally:
            # Make sure that we unblock the thread even on a test failure.
            event.set()

        self.pool.join()

        self.assertTrue(self.pool.isDone())

    # empty_completed_items() discards finished commands, so a later
    # getCompletedItems() returns an empty list.
    def test_pool_can_be_emptied_of_completed_commands(self):
        cmd = mock.Mock(spec=Command)

        self.pool.addCommand(cmd)
        self.pool.join()

        self.pool.empty_completed_items()
        self.assertEqual(self.pool.getCompletedItems(), [])

    # check_results() must not raise when every command succeeded.
    def test_check_results_succeeds_when_no_items_fail(self):
        cmd = mock.Mock(spec=Command)

        # Command.get_results() returns a CommandResult.
        # CommandResult.wasSuccessful() should return True if the command
        # succeeds.
        result = cmd.get_results.return_value
        result.wasSuccessful.return_value = True

        self.pool.addCommand(cmd)
        self.pool.join()
        self.pool.check_results()

    # check_results() raises ExecutionError for an unsuccessful command.
    def test_check_results_throws_exception_at_first_failure(self):
        cmd = mock.Mock(spec=Command)

        # Command.get_results() returns a CommandResult.
        # CommandResult.wasSuccessful() should return False to simulate a
        # failure.
        result = cmd.get_results.return_value
        result.wasSuccessful.return_value = False

        self.pool.addCommand(cmd)
        self.pool.join()

        with self.assertRaises(ExecutionError):
            self.pool.check_results()

    # join(timeout) on an idle pool returns a true value without waiting
    # out the timeout.
    def test_join_with_timeout_returns_done_immediately_if_there_is_nothing_to_do(self):
        start = time.time()
        done = self.pool.join(10)
        delta = time.time() - start

        self.assertTrue(done)

        # "Returns immediately" is a difficult thing to test. Longer than two
        # seconds seems like a reasonable failure case, even on a heavily loaded
        # test container.
        self.assertLess(delta, 2)

    # join(timeout) returns a false value while a command is still
    # blocked, for positive, zero and negative timeouts alike.
    def test_join_with_timeout_doesnt_return_done_until_all_commands_complete(self):
        cmd = mock.Mock(spec=Command)

        # cmd.run() will block until this Event is set.
        event = threading.Event()
        def wait_for_event():
            event.wait()
        cmd.run.side_effect = wait_for_event

        try:
            self.pool.addCommand(cmd)

            done = self.pool.join(0.001)
            self.assertFalse(done)

            # Test zero and negative timeouts too.
            done = self.pool.join(0)
            self.assertFalse(done)

            done = self.pool.join(-1)
            self.assertFalse(done)

        finally:
            # Make sure that we unblock the thread even on a test failure.
            event.set()

        done = self.pool.join(2) # should be immediate, but there's still a race
        self.assertTrue(done)

    # pool.completed counts the commands that have finished.
    def test_completed_returns_number_of_completed_commands(self):
        for _ in range(3):
            cmd = mock.Mock(spec=Command)
            self.pool.addCommand(cmd)

        self.pool.join()
        self.assertEqual(self.pool.completed, 3)

    def test_completed_can_be_cleared_back_to_zero(self):
        for _ in range(3):
            cmd = mock.Mock(spec=Command)
            self.pool.addCommand(cmd)

        self.pool.join()
        self.pool.empty_completed_items()
        self.assertEqual(self.pool.completed, 0)

    # Retrieving the completed items also resets the completed counter.
    def test_completed_is_reset_to_zero_after_getCompletedItems(self):
        for _ in range(3):
            cmd = mock.Mock(spec=Command)
            self.pool.addCommand(cmd)

        self.pool.join()
        self.pool.getCompletedItems()
        self.assertEqual(self.pool.completed, 0)

    # pool.assigned counts the commands added so far; it is checked here
    # without joining first.
    def test_assigned_returns_number_of_assigned_commands(self):
        for _ in range(3):
            cmd = mock.Mock(spec=Command)
            self.pool.addCommand(cmd)

        self.assertEqual(self.pool.assigned, 3)

    def test_assigned_is_decremented_when_completed_items_are_emptied(self):
        for _ in range(3):
            cmd = mock.Mock(spec=Command)
            self.pool.addCommand(cmd)

        self.pool.join()
        self.pool.empty_completed_items()

        self.assertEqual(self.pool.assigned, 0)

    def test_assigned_is_decremented_when_completed_items_are_checked(self):
        cmd = mock.Mock(spec=Command)

        # Command.get_results() returns a CommandResult.
        # CommandResult.wasSuccessful() should return True if the command
        # succeeds.
        result = cmd.get_results.return_value
        result.wasSuccessful.return_value = True

        self.pool.addCommand(cmd)

        self.pool.join()
        self.pool.check_results()

        self.assertEqual(self.pool.assigned, 0)

    # Popping completed items lowers pool.assigned by the number popped,
    # even while another command is still running.
    def test_assigned_is_decremented_when_completed_items_are_popped(self):
        # The first command will finish immediately.
        cmd1 = mock.Mock(spec=Command)
        self.pool.addCommand(cmd1)

        # The other command will wait until we allow it to continue.
        cmd2 = mock.Mock(spec=Command)

        # cmd.run() will block until this Event is set.
        event = threading.Event()
        def wait_for_event():
            event.wait()
        cmd2.run.side_effect = wait_for_event

        try:
            self.pool.addCommand(cmd2)
            self.assertEqual(self.pool.assigned, 2)

            # Avoid race flakes; make sure we actually complete the first
            # command.
            while self.pool.completed < 1:
                self.pool.join(0.001)

            # Pop the completed item.
            self.assertEqual(self.pool.getCompletedItems(), [cmd1])

            # Now we should be down to one assigned command.
            self.assertEqual(self.pool.assigned, 1)

        finally:
            # Make sure that we unblock the thread even on a test failure.
            event.set()

        self.pool.join()

        # Pop the other completed item.
        self.assertEqual(self.pool.getCompletedItems(), [cmd2])
        self.assertEqual(self.pool.assigned, 0)

    # With nothing queued, join_and_indicate_progress() writes nothing.
    def test_join_and_indicate_progress_prints_nothing_if_pool_is_done(self):
        stdout = StringIO.StringIO()
        join_and_indicate_progress(self.pool, stdout)

        self.assertEqual(stdout.getvalue(), '')

    # While a command is blocked, join_and_indicate_progress() keeps
    # writing '.' characters; once the command finishes, the remaining
    # output is dots followed by a single newline.
    def test_join_and_indicate_progress_prints_dots_until_pool_is_done(self):
        cmd = mock.Mock(spec=Command)

        # cmd.run() will block until this Event is set.
        event = threading.Event()
        def wait_for_event():
            event.wait()
        cmd.run.side_effect = wait_for_event

        # Open up a pipe and wrap each end in a file-like object.
        read_end, write_end = os.pipe()
        read_end = os.fdopen(read_end, 'r')
        write_end = os.fdopen(write_end, 'w')

        # Create a thread to perform join_and_indicate_progress().
        def tmain():
            join_and_indicate_progress(self.pool, write_end, interval=0.001)
            write_end.close()
        join_thread = threading.Thread(target=tmain)

        try:
            # Add the command, then join the WorkerPool.
            self.pool.addCommand(cmd)
            join_thread.start()

            # join_and_indicate_progress() is now writing to our pipe. Wait for
            # a few dots...
            for _ in range(3):
                byte = read_end.read(1)
                self.assertEqual(byte, '.')

            # ...then stop the command.
            event.set()

            # Make sure the rest of the output consists of dots ending in a
            # newline. (tmain() closes the write end of the pipe so that this
            # read() will complete.)
            remaining = read_end.read()
            self.assertRegexpMatches(remaining, r'^[.]*\n$')

        finally:
            # Make sure that we unblock and join all threads, even on a test
            # failure.
            event.set()
            join_thread.join()

    # Every write('.') recorded on the mock stream must be immediately
    # followed by a flush() call.
    def test_join_and_indicate_progress_flushes_every_dot(self):
        duration = 0.005

        # The command sleeps for `duration`, and the progress interval is a
        # fifth of that.
        cmd = mock.Mock(spec=Command)
        def wait_for_duration():
            time.sleep(duration)
        cmd.run.side_effect = wait_for_duration
        self.pool.addCommand(cmd)

        stdout = mock.Mock(spec=file)
        join_and_indicate_progress(self.pool, stdout, interval=(duration / 5))

        for i, call in enumerate(stdout.mock_calls):
            # Every written dot should be followed by a flush().
            if call == mock.call.write('.'):
                self.assertEqual(stdout.mock_calls[i + 1], mock.call.flush())
Ejemplo n.º 18
0
class WorkerPoolTest(unittest.TestCase):
    """
    Unit tests for WorkerPool and join_and_indicate_progress().

    Every test gets a fresh single-worker pool with a mocked logger; commands
    are mock.Mock(spec=Command) objects, so no real command is executed.
    """

    def setUp(self):
        self.pool = WorkerPool(numWorkers=1, logger=mock.Mock())

    def tearDown(self):
        # All background threads must be stopped, or else the test runner will
        # hang waiting. Join the stopped threads to make sure we're completely
        # clean for the next test.
        self.pool.haltWork()
        self.pool.joinWorkers()

    def _make_blocking_command(self):
        """
        Build a mocked Command whose run() blocks until released.

        Returns a (cmd, event) pair; cmd.run() waits on the threading.Event
        and returns once event.set() has been called. Callers must always set
        the event (typically in a finally clause) so the worker thread is
        unblocked even when the test fails.
        """
        cmd = mock.Mock(spec=Command)
        event = threading.Event()

        def wait_for_event():
            event.wait()

        cmd.run.side_effect = wait_for_event
        return cmd, event

    def _add_commands(self, count):
        """Add `count` freshly mocked Commands to the pool."""
        for _ in range(count):
            self.pool.addCommand(mock.Mock(spec=Command))

    def test_pool_must_have_some_workers(self):
        with self.assertRaises(Exception):
            WorkerPool(numWorkers=0)

    def test_pool_runs_added_command(self):
        cmd = mock.Mock(spec=Command)

        self.pool.addCommand(cmd)
        self.pool.join()

        cmd.run.assert_called_once_with()

    def test_completed_commands_are_retrievable(self):
        cmd = mock.Mock(spec=Command)

        self.pool.addCommand(cmd)  # should quickly be completed
        self.pool.join()

        self.assertEqual(self.pool.getCompletedItems(), [cmd])

    def test_pool_is_not_marked_done_until_commands_finish(self):
        # cmd.run() will block until the Event is set.
        cmd, event = self._make_blocking_command()

        self.assertTrue(self.pool.isDone())

        try:
            self.pool.addCommand(cmd)
            self.assertFalse(self.pool.isDone())

        finally:
            # Make sure that we unblock the thread even on a test failure.
            event.set()

        self.pool.join()

        self.assertTrue(self.pool.isDone())

    def test_pool_can_be_emptied_of_completed_commands(self):
        cmd = mock.Mock(spec=Command)

        self.pool.addCommand(cmd)
        self.pool.join()

        self.pool.empty_completed_items()
        self.assertEqual(self.pool.getCompletedItems(), [])

    def test_check_results_succeeds_when_no_items_fail(self):
        cmd = mock.Mock(spec=Command)

        # Command.get_results() returns a CommandResult.
        # CommandResult.wasSuccessful() should return True if the command
        # succeeds.
        result = cmd.get_results.return_value
        result.wasSuccessful.return_value = True

        self.pool.addCommand(cmd)
        self.pool.join()
        self.pool.check_results()

    def test_check_results_throws_exception_at_first_failure(self):
        cmd = mock.Mock(spec=Command)

        # Command.get_results() returns a CommandResult.
        # CommandResult.wasSuccessful() should return False to simulate a
        # failure.
        result = cmd.get_results.return_value
        result.wasSuccessful.return_value = False

        self.pool.addCommand(cmd)
        self.pool.join()

        with self.assertRaises(ExecutionError):
            self.pool.check_results()

    def test_join_with_timeout_returns_done_immediately_if_there_is_nothing_to_do(
            self):
        start = time.time()
        done = self.pool.join(10)
        delta = time.time() - start

        self.assertTrue(done)

        # "Returns immediately" is a difficult thing to test. Longer than two
        # seconds seems like a reasonable failure case, even on a heavily loaded
        # test container.
        self.assertLess(delta, 2)

    def test_join_with_timeout_doesnt_return_done_until_all_commands_complete(
            self):
        # cmd.run() will block until the Event is set.
        cmd, event = self._make_blocking_command()

        try:
            self.pool.addCommand(cmd)

            done = self.pool.join(0.001)
            self.assertFalse(done)

            # Test zero and negative timeouts too.
            done = self.pool.join(0)
            self.assertFalse(done)

            done = self.pool.join(-1)
            self.assertFalse(done)

        finally:
            # Make sure that we unblock the thread even on a test failure.
            event.set()

        done = self.pool.join(
            2)  # should be immediate, but there's still a race
        self.assertTrue(done)

    def test_completed_returns_number_of_completed_commands(self):
        self._add_commands(3)

        self.pool.join()
        self.assertEqual(self.pool.completed, 3)

    def test_completed_can_be_cleared_back_to_zero(self):
        self._add_commands(3)

        self.pool.join()
        self.pool.empty_completed_items()
        self.assertEqual(self.pool.completed, 0)

    def test_completed_is_reset_to_zero_after_getCompletedItems(self):
        self._add_commands(3)

        self.pool.join()
        self.pool.getCompletedItems()
        self.assertEqual(self.pool.completed, 0)

    def test_assigned_returns_number_of_assigned_commands(self):
        self._add_commands(3)

        self.assertEqual(self.pool.assigned, 3)

    def test_assigned_is_decremented_when_completed_items_are_emptied(self):
        self._add_commands(3)

        self.pool.join()
        self.pool.empty_completed_items()

        self.assertEqual(self.pool.assigned, 0)

    def test_assigned_is_decremented_when_completed_items_are_checked(self):
        cmd = mock.Mock(spec=Command)

        # Command.get_results() returns a CommandResult.
        # CommandResult.wasSuccessful() should return True if the command
        # succeeds.
        result = cmd.get_results.return_value
        result.wasSuccessful.return_value = True

        self.pool.addCommand(cmd)

        self.pool.join()
        self.pool.check_results()

        self.assertEqual(self.pool.assigned, 0)

    def test_assigned_is_decremented_when_completed_items_are_popped(self):
        # The first command will finish immediately.
        cmd1 = mock.Mock(spec=Command)
        self.pool.addCommand(cmd1)

        # The other command will wait until we allow it to continue; its
        # run() blocks until the Event is set.
        cmd2, event = self._make_blocking_command()

        try:
            self.pool.addCommand(cmd2)
            self.assertEqual(self.pool.assigned, 2)

            # Avoid race flakes; make sure we actually complete the first
            # command.
            while self.pool.completed < 1:
                self.pool.join(0.001)

            # Pop the completed item.
            self.assertEqual(self.pool.getCompletedItems(), [cmd1])

            # Now we should be down to one assigned command.
            self.assertEqual(self.pool.assigned, 1)

        finally:
            # Make sure that we unblock the thread even on a test failure.
            event.set()

        self.pool.join()

        # Pop the other completed item.
        self.assertEqual(self.pool.getCompletedItems(), [cmd2])
        self.assertEqual(self.pool.assigned, 0)

    def test_join_and_indicate_progress_prints_nothing_if_pool_is_done(self):
        stdout = io.StringIO()
        join_and_indicate_progress(self.pool, stdout)

        self.assertEqual(stdout.getvalue(), '')

    def test_join_and_indicate_progress_prints_dots_until_pool_is_done(self):
        # cmd.run() will block until the Event is set.
        cmd, event = self._make_blocking_command()

        # Open up a pipe and wrap each end in a file-like object.
        read_fd, write_fd = os.pipe()
        read_end = os.fdopen(read_fd, 'r')
        write_end = os.fdopen(write_fd, 'w')

        # Release both descriptors when the test is over, whatever happens.
        # close() on an already-closed file object is a no-op, so this is safe
        # even though tmain() normally closes the write end itself.
        self.addCleanup(read_end.close)
        self.addCleanup(write_end.close)

        # Create a thread to perform join_and_indicate_progress().
        def tmain():
            join_and_indicate_progress(self.pool, write_end, interval=0.001)
            write_end.close()

        join_thread = threading.Thread(target=tmain)
        thread_started = False

        try:
            # Add the command, then join the WorkerPool.
            self.pool.addCommand(cmd)
            join_thread.start()
            thread_started = True

            # join_and_indicate_progress() is now writing to our pipe. Wait for
            # a few dots...
            for _ in range(3):
                byte = read_end.read(1)
                self.assertEqual(byte, '.')

            # ...then stop the command.
            event.set()

            # Make sure the rest of the output consists of dots ending in a
            # newline. (tmain() closes the write end of the pipe so that this
            # read() will complete.)
            remaining = read_end.read()
            self.assertRegex(remaining, r'^[.]*\n$')

        finally:
            # Make sure that we unblock and join all threads, even on a test
            # failure.
            event.set()
            # Joining a thread that was never started raises RuntimeError,
            # which would hide the error that prevented the start.
            if thread_started:
                join_thread.join()

    def test_join_and_indicate_progress_flushes_every_dot(self):
        duration = 0.005

        cmd = mock.Mock(spec=Command)

        def wait_for_duration():
            time.sleep(duration)

        cmd.run.side_effect = wait_for_duration
        self.pool.addCommand(cmd)

        stdout = mock.Mock(spec=io.StringIO)
        join_and_indicate_progress(self.pool, stdout, interval=(duration / 5))

        calls = stdout.mock_calls
        for i, call in enumerate(calls):
            # Every written dot should be followed by a flush().
            if call == mock.call.write('.'):
                # Report a trailing, unflushed dot as a test failure instead
                # of letting the lookup below die with an IndexError.
                self.assertLess(i + 1, len(calls),
                                "the last dot written was never flushed")
                self.assertEqual(calls[i + 1], mock.call.flush())
class ConcurrentFilespaceMoveTestCase(unittest.TestCase):
    """
    Runs gpfilespace move commands concurrently through a WorkerPool.

    The expected behavior is that exactly one of the concurrent processes
    goes through, while every other one reports ALREADY_RUNNING_MSG.
    """

    ALREADY_RUNNING_MSG = 'Another instance of gpfilespace is already running!'

    def setUp(self):
        self.pool = None
        self.pool = WorkerPool()

    def tearDown(self):
        # Nothing to shut down when no pool was created.
        if not self.pool:
            return
        self.pool.haltWork()
        self.pool.joinWorkers()
        self.pool.join()

    def get_move_filespace_cmd(self,
                               filespace='myfspc',
                               file_type=FileType.TEMPORARY_FILES):
        # Map the FileType onto the matching gpfilespace long option; any
        # other value is passed through unchanged.
        option = file_type
        if file_type == FileType.TEMPORARY_FILES:
            option = 'movetempfiles'
        elif file_type == FileType.TRANSACTION_FILES:
            option = 'movetransfiles'

        command_line = 'gpfilespace --%s %s' % (option, filespace)
        return Command(name='move filespace', cmdStr=command_line)

    def run_concurrently(self, cmd_list):
        # Queue every command first, then wait for the pool to finish.
        for queued in cmd_list:
            self.pool.addCommand(queued)
        self.pool.join()

    def check_concurrent_execution_result(self, execution_results):
        # A command counts as having gone through when its stdout does not
        # carry the "already running" message.
        winners = [
            finished for finished in execution_results
            if self.ALREADY_RUNNING_MSG not in
            finished.get_results().stdout.strip()
        ]

        self.assertEqual(len(winners), 1)

    def test00_move_temp_filespace(self):
        commands = []
        for _ in range(2):
            commands.append(
                self.get_move_filespace_cmd(
                    file_type=FileType.TEMPORARY_FILES))

        self.run_concurrently(commands)
        self.check_concurrent_execution_result(self.pool.getCompletedItems())

    def test01_move_trans_filespace(self):
        commands = []
        for _ in range(2):
            commands.append(
                self.get_move_filespace_cmd(
                    file_type=FileType.TRANSACTION_FILES))

        self.run_concurrently(commands)
        self.check_concurrent_execution_result(self.pool.getCompletedItems())

    def test02_move_temp_and_trans_filespace(self):
        temp_move = self.get_move_filespace_cmd(
            file_type=FileType.TEMPORARY_FILES)
        trans_move = self.get_move_filespace_cmd(
            file_type=FileType.TRANSACTION_FILES)

        self.run_concurrently([temp_move, trans_move])
        self.check_concurrent_execution_result(self.pool.getCompletedItems())
Ejemplo n.º 20
0
class GpRecoverSegmentProgram:
    """
    Implementation of the gprecoverseg program.

    run() validates the parsed options, loads the cluster configuration and
    then either writes a sample recovery configuration file, rebalances
    segments, or recovers the mirrors that need to be built.
    """

    def __init__(self, options):
        """
        Constructor.

        @param options the options as returned by the options parser
        """
        self.__options = options
        self.__pool = None  # created in run(), shut down in cleanup()
        self.logger = logger

        # If user did not specify a value for showProgressInplace and
        # stdout is a tty then send escape sequences to gprecoverseg
        # output. Otherwise do not show progress inplace.
        if self.__options.showProgressInplace is None:
            self.__options.showProgressInplace = sys.stdout.isatty()

    def getProgressMode(self):
        """
        Translate the showProgress / showProgressInplace options into a
        GpMirrorListToBuild.Progress value (NONE, INPLACE or SEQUENTIAL).
        """
        if self.__options.showProgress:
            if self.__options.showProgressInplace:
                progressMode = GpMirrorListToBuild.Progress.INPLACE
            else:
                progressMode = GpMirrorListToBuild.Progress.SEQUENTIAL
        else:
            progressMode = GpMirrorListToBuild.Progress.NONE

        return progressMode

    def outputToFile(self, mirrorBuilder, gpArray, fileName):
        """
        Write one line per mirror to build into fileName.

        Each line holds the failed segment as 'address|port|datadir',
        followed by a space and the failover segment in the same format when
        the mirror has one. gpArray is not used by this method.
        """
        lines = []

        # one entry for each failure
        for mirror in mirrorBuilder.getMirrorsToBuild():
            output_str = ""
            seg = mirror.getFailedSegment()
            addr = canonicalize_address(seg.getSegmentAddress())
            output_str += ('%s|%d|%s' % (addr, seg.getSegmentPort(), seg.getSegmentDataDirectory()))

            seg = mirror.getFailoverSegment()
            if seg is not None:
                output_str += ' '
                addr = canonicalize_address(seg.getSegmentAddress())
                output_str += ('%s|%d|%s' % (
                    addr, seg.getSegmentPort(), seg.getSegmentDataDirectory()))

            lines.append(output_str)
        writeLinesToFile(fileName, lines)

    def getRecoveryActionsBasedOnOptions(self, gpEnv, gpArray):
        """
        Build the object that will carry out the requested work.

        Returns a GpSegmentRebalanceOperation when rebalancing was requested,
        otherwise a GpMirrorListToBuild made from the recovery triplets
        derived from the recovery config file / new recover hosts options.
        """
        if self.__options.rebalanceSegments:
            return GpSegmentRebalanceOperation(gpEnv, gpArray, self.__options.parallelDegree, self.__options.parallelPerHost)
        else:
            instance = RecoveryTripletsFactory.instance(gpArray, self.__options.recoveryConfigFile, self.__options.newRecoverHosts)
            segs = [GpMirrorToBuild(t.failed, t.live, t.failover, self.__options.forceFullResynchronization) for t in instance.getTriplets()]
            return GpMirrorListToBuild(segs, self.__pool, self.__options.quiet,
                                       self.__options.parallelDegree,
                                       instance.getInterfaceHostnameWarnings(),
                                       forceoverwrite=True,
                                       progressMode=self.getProgressMode(),
                                       parallelPerHost=self.__options.parallelPerHost)

    def syncPackages(self, new_hosts):
        """
        Best-effort synchronization of packages onto new_hosts.

        Failures are logged and otherwise ignored; they never propagate to
        the caller.
        """
        # The design decision here is to squash any exceptions resulting from the
        # synchronization of packages. We should *not* disturb the user's attempts to recover.
        try:
            self.logger.info('Syncing Greenplum Database extensions')
            operations = [SyncPackages(host) for host in new_hosts]
            ParallelOperation(operations, self.__options.parallelDegree).run()
            # introspect outcomes
            for operation in operations:
                operation.get_ret()
        except Exception:
            # Deliberately not a bare "except:": KeyboardInterrupt and
            # SystemExit must still be able to stop the program.
            self.logger.exception('Syncing of Greenplum Database extensions has failed.')
            self.logger.warning('Please run gppkg --clean after successful segment recovery.')

    def displayRecovery(self, mirrorBuilder, gpArray):
        """
        Log a summary of the work about to be done: the recovery type and
        then one table per unbalanced segment (rebalance) or per mirror to
        build (recovery).
        """
        self.logger.info('Greenplum instance recovery parameters')
        self.logger.info('---------------------------------------------------------')

        if self.__options.recoveryConfigFile:
            self.logger.info('Recovery from configuration -i option supplied')
        elif self.__options.newRecoverHosts is not None:
            self.logger.info('Recovery type              = Pool Host')
            for h in self.__options.newRecoverHosts:
                self.logger.info('Pool host for recovery     = %s' % h)
        elif self.__options.rebalanceSegments:
            self.logger.info('Recovery type              = Rebalance')
        else:
            self.logger.info('Recovery type              = Standard')

        if self.__options.rebalanceSegments:
            # Fetch the list once so the total and the rows come from the
            # same snapshot.
            unbalanced = gpArray.get_unbalanced_segdbs()
            total = len(unbalanced)
            for i, toRebalance in enumerate(unbalanced, start=1):
                tabLog = TableLogger()
                self.logger.info('---------------------------------------------------------')
                self.logger.info('Unbalanced segment %d of %d' % (i, total))
                self.logger.info('---------------------------------------------------------')
                programIoUtils.appendSegmentInfoForOutput("Unbalanced", gpArray, toRebalance, tabLog)
                tabLog.info(["Balanced role", "= Primary" if toRebalance.preferred_role == 'p' else "= Mirror"])
                tabLog.info(["Current role", "= Primary" if toRebalance.role == 'p' else "= Mirror"])
                tabLog.outputTable()
        else:
            mirrors = mirrorBuilder.getMirrorsToBuild()
            total = len(mirrors)
            for i, toRecover in enumerate(mirrors, start=1):
                self.logger.info('---------------------------------------------------------')
                self.logger.info('Recovery %d of %d' % (i, total))
                self.logger.info('---------------------------------------------------------')

                tabLog = TableLogger()

                syncMode = "Full" if toRecover.isFullSynchronization() else "Incremental"
                tabLog.info(["Synchronization mode", "= " + syncMode])
                programIoUtils.appendSegmentInfoForOutput("Failed", gpArray, toRecover.getFailedSegment(), tabLog)
                programIoUtils.appendSegmentInfoForOutput("Recovery Source", gpArray, toRecover.getLiveSegment(),
                                                          tabLog)

                if toRecover.getFailoverSegment() is not None:
                    programIoUtils.appendSegmentInfoForOutput("Recovery Target", gpArray,
                                                              toRecover.getFailoverSegment(), tabLog)
                else:
                    tabLog.info(["Recovery Target", "= in-place"])
                tabLog.outputTable()

        self.logger.info('---------------------------------------------------------')

    def __getSimpleSegmentLabel(self, seg):
        """Return 'address:datadir' for seg, for use in warning messages."""
        addr = canonicalize_address(seg.getSegmentAddress())
        return "%s:%s" % (addr, seg.getSegmentDataDirectory())

    def __displayRecoveryWarnings(self, mirrorBuilder):
        """Log every warning from _getRecoveryWarnings() at warning level."""
        for warning in self._getRecoveryWarnings(mirrorBuilder):
            # Logger.warn() is a deprecated alias; warning() is what the rest
            # of this class already uses.
            self.logger.warning(warning)

    def _getRecoveryWarnings(self, mirrorBuilder):
        """
        return an array of string warnings regarding the recovery
        """
        res = []
        for toRecover in mirrorBuilder.getMirrorsToBuild():

            if toRecover.getFailoverSegment() is not None:
                #
                # user specified a failover location -- warn if it's the same host as its primary
                #
                src = toRecover.getLiveSegment()
                dest = toRecover.getFailoverSegment()

                if src.getSegmentHostName() == dest.getSegmentHostName():
                    res.append("Segment is being recovered to the same host as its primary: "
                               "primary %s    failover target: %s"
                               % (self.__getSimpleSegmentLabel(src), self.__getSimpleSegmentLabel(dest)))

        res.extend(mirrorBuilder.getAdditionalWarnings())

        return res

    def _get_dblist(self):
        """Return the rows of database names from pg_database, minus template0."""
        # template0 does not accept any connections so we exclude it
        with closing(dbconn.connect(dbconn.DbURL())) as conn:
            res = dbconn.query(conn, "SELECT datname FROM PG_DATABASE WHERE datname != 'template0'")
            return res.fetchall()

    def run(self):
        """
        Program entry point.

        Validates the options, loads the system configuration and performs
        the requested action. Raises ProgramArgumentValidationException for
        invalid option values or combinations, ExceptionNoStackTraceNeeded
        when mirroring is not configured, and UserAbortedException when the
        user declines an interactive prompt. Always finishes through
        sys.exit(): status 1 when mirror recovery fails, status 0 otherwise.
        """
        if self.__options.parallelDegree < 1 or self.__options.parallelDegree > gp.MAX_COORDINATOR_NUM_WORKERS:
            raise ProgramArgumentValidationException(
                "Invalid parallelDegree value provided with -B argument: %d" % self.__options.parallelDegree)
        if self.__options.parallelPerHost < 1 or self.__options.parallelPerHost > gp.MAX_SEGHOST_NUM_WORKERS:
            raise ProgramArgumentValidationException(
                "Invalid parallelPerHost value provided with -b argument: %d" % self.__options.parallelPerHost)

        self.__pool = WorkerPool(self.__options.parallelDegree)
        gpEnv = GpCoordinatorEnvironment(self.__options.coordinatorDataDirectory, True)

        # verify "where to recover" options
        optionCnt = 0
        if self.__options.newRecoverHosts is not None:
            optionCnt += 1
        if self.__options.recoveryConfigFile is not None:
            optionCnt += 1
        if self.__options.rebalanceSegments:
            optionCnt += 1
        if optionCnt > 1:
            raise ProgramArgumentValidationException("Only one of -i, -p, and -r may be specified")

        faultProberInterface.getFaultProber().initializeProber(gpEnv.getCoordinatorPort())

        confProvider = configInterface.getConfigurationProvider().initializeProvider(gpEnv.getCoordinatorPort())

        gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

        if not gpArray.hasMirrors:
            raise ExceptionNoStackTraceNeeded(
                'GPDB Mirroring replication is not configured for this Greenplum Database instance.')

        # Flag the segments whose hosts cannot be reached before planning.
        num_workers = min(len(gpArray.get_hostlist()), self.__options.parallelDegree)
        hosts = set(gpArray.get_hostlist(includeCoordinator=False))
        unreachable_hosts = get_unreachable_segment_hosts(hosts, num_workers)
        update_unreachable_flag_for_segments(gpArray, unreachable_hosts)

        # We have phys-rep/filerep mirrors.

        if self.__options.newRecoverHosts is not None:
            # Turn the comma-separated -p value into a list of unique,
            # stripped host names, keeping their original order.
            try:
                uniqueHosts = []
                for h in self.__options.newRecoverHosts.split(','):
                    if h.strip() not in uniqueHosts:
                        uniqueHosts.append(h.strip())
                self.__options.newRecoverHosts = uniqueHosts
            except Exception as ex:
                raise ProgramArgumentValidationException(
                    "Invalid value for recover hosts: %s" % ex)

        # retain list of hosts that were existing in the system prior to getRecoverActions...
        # this will be needed for later calculations that determine whether
        # new hosts were added into the system
        existing_hosts = set(gpArray.getHostList())

        # figure out what needs to be done
        mirrorBuilder = self.getRecoveryActionsBasedOnOptions(gpEnv, gpArray)

        if self.__options.outputSampleConfigFile is not None:
            # just output config file and done
            self.outputToFile(mirrorBuilder, gpArray, self.__options.outputSampleConfigFile)
            self.logger.info('Configuration file output to %s successfully.' % self.__options.outputSampleConfigFile)
        elif self.__options.rebalanceSegments:
            assert (isinstance(mirrorBuilder, GpSegmentRebalanceOperation))

            # Make sure we have work to do
            if len(gpArray.get_unbalanced_segdbs()) == 0:
                self.logger.info("No segments are running in their non-preferred role and need to be rebalanced.")
            else:
                self.displayRecovery(mirrorBuilder, gpArray)

                if self.__options.interactive:
                    self.logger.warning("This operation will cancel queries that are currently executing.")
                    self.logger.warning("Connections to the database however will not be interrupted.")
                    if not userinput.ask_yesno(None, "\nContinue with segment rebalance procedure", 'N'):
                        raise UserAbortedException()

                fullRebalanceDone = mirrorBuilder.rebalance()
                self.logger.info("******************************************************************")
                if fullRebalanceDone:
                    self.logger.info("The rebalance operation has completed successfully.")
                else:
                    self.logger.info("The rebalance operation has completed with WARNINGS."
                                     " Please review the output in the gprecoverseg log.")
                self.logger.info("******************************************************************")

        elif len(mirrorBuilder.getMirrorsToBuild()) == 0:
            self.logger.info('No segments to recover')
        else:
            #TODO this already happens in buildMirrors function
            mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)
            self.validate_heap_checksum_consistency(gpArray, mirrorBuilder)

            self.displayRecovery(mirrorBuilder, gpArray)
            self.__displayRecoveryWarnings(mirrorBuilder)

            if self.__options.interactive:
                if not userinput.ask_yesno(None, "\nContinue with segment recovery procedure", 'N'):
                    raise UserAbortedException()

            # sync packages
            current_hosts = set(gpArray.getHostList())
            new_hosts = current_hosts - existing_hosts
            if new_hosts:
                self.syncPackages(new_hosts)

            contentsToUpdate = [seg.getLiveSegment().getSegmentContentId() for seg in mirrorBuilder.getMirrorsToBuild()]
            update_pg_hba_on_segments(gpArray, self.__options.hba_hostnames, self.__options.parallelDegree, contentsToUpdate)
            if not mirrorBuilder.recover_mirrors(gpEnv, gpArray):
                self.logger.error("gprecoverseg failed. Please check the output for more details.")
                sys.exit(1)

            self.logger.info("********************************")
            self.logger.info("Segments successfully recovered.")
            self.logger.info("********************************")

            self.logger.info("Recovered mirror segments need to sync WAL with primary segments.")
            self.logger.info("Use 'gpstate -e' to check progress of WAL sync remaining bytes")

        sys.exit(0)

    def validate_heap_checksum_consistency(self, gpArray, mirrorBuilder):
        """
        Check the heap checksum setting of the live segments that are
        recovery sources.

        Raises Exception when no segment answers the query or when any
        responding segment is reported as inconsistent.
        """
        live_segments = [target.getLiveSegment() for target in mirrorBuilder.getMirrorsToBuild()]
        if len(live_segments) == 0:
            self.logger.info("No checksum validation necessary when there are no segments to recover.")
            return

        heap_checksum = HeapChecksum(gpArray, num_workers=min(self.__options.parallelDegree, len(live_segments)), logger=self.logger)
        successes, failures = heap_checksum.get_segments_checksum_settings(live_segments)
        # go forward if we have at least one segment that has replied
        if len(successes) == 0:
            raise Exception("No segments responded to ssh query for heap checksum validation.")
        consistent, inconsistent, coordinator_checksum_value = heap_checksum.check_segment_consistency(successes)
        if len(inconsistent) > 0:
            self.logger.fatal("Heap checksum setting differences reported on segments")
            self.logger.fatal("Failed checksum consistency validation:")
            for gpdb in inconsistent:
                segment_name = gpdb.getSegmentHostName()
                checksum = gpdb.heap_checksum
                self.logger.fatal("%s checksum set to %s differs from coordinator checksum set to %s" %
                                  (segment_name, checksum, coordinator_checksum_value))
            raise Exception("Heap checksum setting differences reported on segments")
        self.logger.info("Heap checksum setting is consistent between coordinator and the segments that are candidates "
                         "for recoverseg")

    def cleanup(self):
        """Shut down the worker pool, if run() got far enough to create one."""
        if self.__pool:
            self.__pool.haltWork()  # \  MPP-13489, CR-2572
            self.__pool.joinWorkers()  # > all three of these appear necessary
            self.__pool.join()  # /  see MPP-12633, CR-2252 as well

    # -------------------------------------------------------------------------

    @staticmethod
    def createParser():
        """Build and return the OptParser describing the gprecoverseg options."""
        description = ("Recover a failed segment")
        # Named help_text rather than "help" so the builtin is not shadowed.
        help_text = [""]

        parser = OptParser(option_class=OptChecker,
                           description=' '.join(description.split()),
                           version='%prog version $Revision$')
        parser.setHelp(help_text)

        loggingGroup = addStandardLoggingAndHelpOptions(parser, True)
        # default=None lets __init__ tell "not specified" apart from -s.
        loggingGroup.add_option("-s", None, default=None, action='store_false',
                                dest='showProgressInplace',
                                help='Show pg_basebackup/pg_rewind progress sequentially instead of inplace')
        loggingGroup.add_option("--no-progress",
                                dest="showProgress", default=True, action="store_false",
                                help="Suppress pg_basebackup/pg_rewind progress output")

        addTo = OptionGroup(parser, "Connection Options")
        parser.add_option_group(addTo)
        addCoordinatorDirectoryOptionForSingleClusterProgram(addTo)

        addTo = OptionGroup(parser, "Recovery Source Options")
        parser.add_option_group(addTo)
        addTo.add_option("-i", None, type="string",
                         dest="recoveryConfigFile",
                         metavar="<configFile>",
                         help="Recovery configuration file")
        addTo.add_option("-o", None,
                         dest="outputSampleConfigFile",
                         metavar="<configFile>", type="string",
                         help="Sample configuration file name to output; "
                              "this file can be passed to a subsequent call using -i option")

        addTo = OptionGroup(parser, "Recovery Destination Options")
        parser.add_option_group(addTo)
        addTo.add_option("-p", None, type="string",
                         dest="newRecoverHosts",
                         metavar="<targetHosts>",
                         help="Spare new hosts to which to recover segments")

        addTo = OptionGroup(parser, "Recovery Options")
        parser.add_option_group(addTo)
        addTo.add_option('-F', None, default=False, action='store_true',
                         dest="forceFullResynchronization",
                         metavar="<forceFullResynchronization>",
                         help="Force full segment resynchronization")
        addTo.add_option("-B", None, type="int", default=gp.DEFAULT_COORDINATOR_NUM_WORKERS,
                         dest="parallelDegree",
                         metavar="<parallelDegree>",
                         help="Max number of hosts to operate on in parallel. Valid values are: 1-%d"
                              % gp.MAX_COORDINATOR_NUM_WORKERS)
        addTo.add_option("-b", None, type="int", default=gp.DEFAULT_SEGHOST_NUM_WORKERS,
                         dest="parallelPerHost",
                         metavar="<parallelPerHost>",
                         help="Max number of segments per host to operate on in parallel. Valid values are: 1-%d"
                              % gp.MAX_SEGHOST_NUM_WORKERS)

        addTo.add_option("-r", None, default=False, action='store_true',
                         dest='rebalanceSegments', help='Rebalance synchronized segments.')
        addTo.add_option('', '--hba-hostnames', action='store_true', dest='hba_hostnames',
                         help='use hostnames instead of CIDR in pg_hba.conf')

        parser.set_defaults()
        return parser

    @staticmethod
    def createProgram(options, args):
        """
        Create the program object from parsed options.

        Raises ProgramArgumentValidationException when any positional
        argument is present; only options are accepted.
        """
        if len(args) > 0:
            raise ProgramArgumentValidationException("too many arguments: only options may be specified", True)
        return GpRecoverSegmentProgram(options)

    @staticmethod
    def mainOptions():
        """
        The dictionary this method returns instructs the simple_main framework
        to check for a gprecoverseg.lock file under COORDINATOR_DATA_DIRECTORY
        to prevent the customer from trying to run more than one instance of
        gprecoverseg at the same time.
        """
        return {'pidlockpath': 'gprecoverseg.lock', 'parentpidvar': 'GPRECOVERPID'}
Ejemplo n.º 21
0
class GpRecoverSegmentProgram:
    #
    # Constructor:
    #
    # @param options the options as returned by the options parser
    #
    def __init__(self, options):
        """
        Remember the parsed options and start without a worker pool.

        options -- the options as returned by the options parser; its
                   showProgressInplace attribute is filled in here when
                   the user left it unset
        """
        self.__options = options
        self.__pool = None
        self.logger = logger

        # No explicit choice from the user: show progress inplace (escape
        # sequences in the output) only when stdout is a tty.
        if options.showProgressInplace is None:
            options.showProgressInplace = sys.stdout.isatty()

    def getProgressMode(self):
        """
        Map the progress options onto a GpMirrorListToBuild.Progress value.

        returns: NONE when showProgress is off, otherwise INPLACE or
                 SEQUENTIAL depending on showProgressInplace
        """
        modes = GpMirrorListToBuild.Progress
        if not self.__options.showProgress:
            return modes.NONE
        if self.__options.showProgressInplace:
            return modes.INPLACE
        return modes.SEQUENTIAL

    def outputToFile(self, mirrorBuilder, gpArray, fileName):
        """
        Write one line per mirror to build into fileName.

        Each line holds the failed segment as address|port|datadir and,
        when the mirror has a failover segment, a space followed by the
        failover segment in the same form.  gpArray is not used.
        """

        def describe(segment):
            # address|port|datadir, with the address canonicalized
            return '%s|%d|%s' % (canonicalize_address(segment.getSegmentAddress()),
                                 segment.getSegmentPort(),
                                 segment.getSegmentDataDirectory())

        entries = []
        for mirror in mirrorBuilder.getMirrorsToBuild():
            entry = describe(mirror.getFailedSegment())
            target = mirror.getFailoverSegment()
            if target is not None:
                entry = entry + ' ' + describe(target)
            entries.append(entry)

        writeLinesToFile(fileName, entries)

    def _getParsedRow(self, filename, lineno, line):
        """
        Parse one line of the recovery configuration file.

        The line holds one or two whitespace separated groups, each of the
        form address|port|datadir.  The first group is the failed segment,
        the optional second one the new location.

        returns: a dict with failedAddress, failedPort, failedDataDirectory
                 and lineno, plus newAddress, newPort and newDataDirectory
                 when a second group is present (ports stay strings)

        raises: ExceptionNoStackTraceNeeded on a wrong number of groups or
                of parts within a group
        """
        # split on any whitespace, NOT line.split(' '), due to MPP-15675
        groups = line.split()
        groupCount = len(groups)
        if groupCount != 1 and groupCount != 2:
            raise ExceptionNoStackTraceNeeded(
                "line %d of file %s: expected 1 or 2 groups but found %d" % (
                    lineno, filename, groupCount))

        failedFields = groups[0].split('|')
        if len(failedFields) != 3:
            raise ExceptionNoStackTraceNeeded(
                "line %d of file %s: expected 3 parts on failed segment group, obtained %d" % (
                    lineno, filename, len(failedFields)))
        failedAddress, failedPort, failedDatadir = failedFields
        check_values(lineno, address=failedAddress, port=failedPort, datadir=failedDatadir)

        row = dict(failedAddress=failedAddress,
                   failedPort=failedPort,
                   failedDataDirectory=failedDatadir,
                   lineno=lineno)
        if groupCount == 1:
            return row

        newFields = groups[1].split('|')
        if len(newFields) != 3:
            raise ExceptionNoStackTraceNeeded(
                "line %d of file %s: expected 3 parts on new segment group, obtained %d" % (
                    lineno, filename, len(newFields)))
        newAddress, newPort, newDatadir = newFields
        check_values(lineno, address=newAddress, port=newPort, datadir=newDatadir)

        row['newAddress'] = newAddress
        row['newPort'] = newPort
        row['newDataDirectory'] = newDatadir
        return row

    def getRecoveryActionsFromConfigFile(self, gpArray):
        """
        getRecoveryActionsFromConfigFile

        Build the recovery plan described by the recovery configuration
        file (the recoveryConfigFile option).  Every row names a failed
        segment as address|port|datadir and, optionally, the location to
        fail over to.  The failed segment has to match exactly one entry of
        gpArray.getDbList().

        returns: a GpMirrorListToBuild object with one GpMirrorToBuild per
                 listed segment that is not flagged unreachable

        raises: Exception when a listed segment matches no entry or more
                than one entry of the configuration, or when the new port
                is not a number; also whatever
                findAndValidatePeersForFailedSegments raises
        """
        filename = self.__options.recoveryConfigFile
        with open(filename) as f:
            rows = [
                self._getParsedRow(filename, lineno, line)
                for lineno, line in line_reader(f)
            ]

        failedSegments = []
        failoverSegments = []
        for row in rows:
            # find the failed segment
            failedAddress = row['failedAddress']
            failedPort = row['failedPort']
            failedDataDirectory = normalizeAndValidateInputPath(
                row['failedDataDirectory'], "config file", row['lineno'])
            failedSegment = None
            for segment in gpArray.getDbList():
                # the port read from the file is a string, hence the str()
                if (segment.getSegmentAddress() == failedAddress
                        and str(segment.getSegmentPort()) == failedPort
                        and segment.getSegmentDataDirectory()
                        == failedDataDirectory):

                    if failedSegment is not None:
                        # this could be an assertion -- configuration should not allow multiple entries!
                        raise Exception((
                            "A segment to recover was found twice in configuration.  "
                            "This segment is described by address|port|directory '%s|%s|%s' "
                            "on the input line: %s") %
                                        (failedAddress, failedPort,
                                         failedDataDirectory, row['lineno']))
                    failedSegment = segment

            if failedSegment is None:
                raise Exception("A segment to recover was not found in configuration.  " \
                                "This segment is described by address|port|directory '%s|%s|%s' on the input line: %s" %
                                (failedAddress, failedPort, failedDataDirectory, row['lineno']))

            failoverSegment = None
            if "newAddress" in row:
                # When the second set was passed, the caller is going to tell us
                # to where we need to failover, so build a failover segment.

                # these two lines make it so that failoverSegment points to the object that is registered in gparray
                failoverSegment = failedSegment
                failedSegment = failoverSegment.copy()

                address = row["newAddress"]
                try:
                    port = int(row["newPort"])
                except ValueError:
                    raise Exception(
                        'Config file format error, invalid number value in line: %s'
                        % (row['lineno']))

                dataDirectory = normalizeAndValidateInputPath(
                    row["newDataDirectory"], "config file", row['lineno'])
                # FIXME: hostname probably should not be address, but to do so, "hostname" should be added to gpaddmirrors config file
                # FIXME: This appears identical to __getMirrorsToBuildFromConfigFilein clsAddMirrors
                hostName = address

                # now update values in failover segment
                failoverSegment.setSegmentAddress(address)
                failoverSegment.setSegmentHostName(hostName)
                failoverSegment.setSegmentPort(port)
                failoverSegment.setSegmentDataDirectory(dataDirectory)

            # this must come AFTER the if check above because failedSegment can be adjusted to
            #   point to a different object
            failedSegments.append(failedSegment)
            failoverSegments.append(failoverSegment)

        peersForFailedSegments = self.findAndValidatePeersForFailedSegments(
            gpArray, failedSegments)

        segs = []
        # Nothing in this method adds to this list; the reporting hook is
        # still invoked with it, as before.
        segs_with_persistent_mirroring_disabled = []
        for failedSegment, peerForFailedSegment, failoverSegment in zip(
                failedSegments, peersForFailedSegments, failoverSegments):
            # segments flagged unreachable are left out of the plan
            if failedSegment.unreachable:
                continue

            segs.append(
                GpMirrorToBuild(failedSegment, peerForFailedSegment,
                                failoverSegment,
                                self.__options.forceFullResynchronization))

        self._output_segments_with_persistent_mirroring_disabled(
            segs_with_persistent_mirroring_disabled)

        return GpMirrorListToBuild(
            segs,
            self.__pool,
            self.__options.quiet,
            self.__options.parallelDegree,
            forceoverwrite=True,
            progressMode=self.getProgressMode(),
            parallelPerHost=self.__options.parallelPerHost)

    def findAndValidatePeersForFailedSegments(self, gpArray, failedSegments):
        """
        Look up the peer of every failed segment and check it is usable.

        returns: the list of peers, in the same order as failedSegments

        raises: Exception when a failed segment has no entry in
                gpArray.getDbIdToPeerMap() or when its peer reports
                isSegmentDown()
        """
        peerByDbId = gpArray.getDbIdToPeerMap()

        peers = []
        for seg in failedSegments:
            peers.append(peerByDbId.get(seg.getSegmentDbId()))

        for seg, peer in zip(failedSegments, peers):
            if peer is None:
                raise Exception("No peer found for dbid %s" %
                                seg.getSegmentDbId())
            if peer.isSegmentDown():
                raise Exception(
                    "Both segments for content %s are down; Try restarting Greenplum DB and running %s again."
                    % (peer.getSegmentContentId(), getProgramName()))

        return peers

    def getRecoveryActionsFromConfiguration(self, gpEnv, gpArray):
        """
        getRecoveryActionsFromConfiguration

        Build the recovery plan from the current cluster configuration:
        every segment of gpArray.getSegDbList() that reports
        isSegmentDown() is a candidate for recovery.

        When new recovery hosts were supplied (the newRecoverHosts option),
        each distinct failed host name is assigned the next new host and
        the failed segments are redirected there on a freshly reserved
        port.  Otherwise the segments are recovered in place, and those
        flagged unreachable are left out.

        gpEnv is not used by this method.

        returns: a GpMirrorListToBuild object describing the segments that
                 will be recovered

        raises: Exception when fewer new hosts were given than there are
                failed hosts; ExceptionNoStackTraceNeeded when a new
                recovery host is unreachable; also whatever
                findAndValidatePeersForFailedSegments raises
        """
        segments = gpArray.getSegDbList()

        failedSegments = [seg for seg in segments if seg.isSegmentDown()]
        peersForFailedSegments = self.findAndValidatePeersForFailedSegments(
            gpArray, failedSegments)

        # Dictionaries used for building mapping to new hosts
        recoverAddressMap = {}
        recoverHostMap = {}
        interfaceHostnameWarnings = []

        recoverHostIdx = 0

        # None and an empty list both mean "recover in place"
        newRecoverHosts = self.__options.newRecoverHosts

        if newRecoverHosts:
            for seg in failedSegments:
                segAddress = seg.getSegmentAddress()
                segHostname = seg.getSegmentHostName()

                # Haven't seen this hostname before so we put it on a new host
                if segHostname not in recoverHostMap:
                    try:
                        recoverHostMap[segHostname] = newRecoverHosts[
                            recoverHostIdx]
                    except IndexError:
                        # If we get here, not enough hosts were specified in the -p option.  Need 1 new host
                        # per 1 failed host.
                        raise Exception(
                            'Not enough new recovery hosts given for recovery.'
                        )
                    recoverHostIdx += 1

                destAddress = recoverHostMap[segHostname]
                destHostname = recoverHostMap[segHostname]

                # Save off the new host/address for this address.
                recoverAddressMap[segAddress] = (destHostname, destAddress)

            new_recovery_hosts = [
                destHostname
                for (destHostname, destAddress) in recoverAddressMap.values()
            ]
            unreachable_hosts = get_unreachable_segment_hosts(
                new_recovery_hosts, len(new_recovery_hosts))
            if unreachable_hosts:
                raise ExceptionNoStackTraceNeeded(
                    "Cannot recover. The recovery target host %s is unreachable."
                    % (' '.join(map(str, unreachable_hosts))))

            for key in list(recoverAddressMap.keys()):
                (newHostname, newAddress) = recoverAddressMap[key]
                try:
                    unix.Ping.local("ping new address", newAddress)
                except Exception:
                    # new address created is invalid, so instead use same hostname for address.
                    # Deliberately broad, but KeyboardInterrupt/SystemExit are no longer swallowed.
                    self.logger.info(
                        "Ping of %s failed, Using %s for both hostname and address.",
                        newAddress, newHostname)
                    newAddress = newHostname
                recoverAddressMap[key] = (newHostname, newAddress)

            if len(newRecoverHosts) != recoverHostIdx:
                interfaceHostnameWarnings.append(
                    "The following recovery hosts were not needed:")
                for h in newRecoverHosts[recoverHostIdx:]:
                    interfaceHostnameWarnings.append("\t%s" % h)

        portAssigner = PortAssigner(gpArray)

        forceFull = self.__options.forceFullResynchronization

        segs = []
        # Nothing in this method adds to this list; the reporting hook is
        # still invoked with it, as before.
        segs_with_persistent_mirroring_disabled = []
        for failedSegment, liveSegment in zip(failedSegments,
                                              peersForFailedSegments):
            failoverSegment = None

            if newRecoverHosts:
                (newRecoverHost, newRecoverAddress
                 ) = recoverAddressMap[failedSegment.getSegmentAddress()]
                # these two lines make it so that failoverSegment points to the object that is registered in gparray
                failoverSegment = failedSegment
                failedSegment = failoverSegment.copy()
                failoverSegment.unreachable = False  # recover to a new host; it is reachable as checked above.
                failoverSegment.setSegmentHostName(newRecoverHost)
                failoverSegment.setSegmentAddress(newRecoverAddress)
                port = portAssigner.findAndReservePort(newRecoverHost,
                                                       newRecoverAddress)
                failoverSegment.setSegmentPort(port)
            elif failedSegment.unreachable:
                # we are recovering to the same host("in place") and hence
                # cannot recover if the failed segment is unreachable.
                # This is equivalent to failoverSegment.unreachable that we should be doing here but
                # due to how the code is factored failoverSegment is None here.
                continue

            segs.append(
                GpMirrorToBuild(failedSegment, liveSegment, failoverSegment,
                                forceFull))

        self._output_segments_with_persistent_mirroring_disabled(
            segs_with_persistent_mirroring_disabled)

        return GpMirrorListToBuild(
            segs,
            self.__pool,
            self.__options.quiet,
            self.__options.parallelDegree,
            interfaceHostnameWarnings,
            forceoverwrite=True,
            progressMode=self.getProgressMode(),
            parallelPerHost=self.__options.parallelPerHost)

    def _output_segments_with_persistent_mirroring_disabled(
            self, segs_persistent_mirroring_disabled=None):
        if segs_persistent_mirroring_disabled:
            self.logger.warn(
                'Segments with dbid %s not recovered; persistent mirroring state is disabled.'
                % (', '.join(
                    str(seg_id)
                    for seg_id in segs_persistent_mirroring_disabled)))

    def getRecoveryActionsBasedOnOptions(self, gpEnv, gpArray):
        """
        Pick the object that will carry out the work requested by the options.

        returns: a GpSegmentRebalanceOperation when rebalancing was
                 requested; otherwise the result of
                 getRecoveryActionsFromConfigFile when a recovery config
                 file was given, else of getRecoveryActionsFromConfiguration
        """
        opts = self.__options
        if opts.rebalanceSegments:
            return GpSegmentRebalanceOperation(gpEnv, gpArray,
                                               opts.parallelDegree,
                                               opts.parallelPerHost)
        if opts.recoveryConfigFile is None:
            return self.getRecoveryActionsFromConfiguration(gpEnv, gpArray)
        return self.getRecoveryActionsFromConfigFile(gpArray)

    def syncPackages(self, new_hosts):
        """
        Run a SyncPackages operation for every host in new_hosts.

        The design decision here is to squash any exceptions resulting from
        the synchronization of packages: a failure is logged, never raised.
        We should *not* disturb the user's attempts to recover.
        """
        try:
            self.logger.info('Syncing Greenplum Database extensions')
            operations = [SyncPackages(host) for host in new_hosts]
            ParallelOperation(operations, self.__options.parallelDegree).run()
            # introspect outcomes
            for operation in operations:
                operation.get_ret()
        except Exception:
            # Errors are squashed on purpose, but KeyboardInterrupt and
            # SystemExit must still get through, hence no bare except.
            self.logger.exception(
                'Syncing of Greenplum Database extensions has failed.')
            self.logger.warning(
                'Please run gppkg --clean after successful segment recovery.')

    def displayRecovery(self, mirrorBuilder, gpArray):
        """
        Log a summary of what is about to be done.

        A header names the recovery type.  It is followed by one table per
        unbalanced segment when rebalancing, or otherwise by one table per
        mirror to build showing the synchronization mode, the failed
        segment, the recovery source and the recovery target.
        """
        opts = self.__options
        rule = '---------------------------------------------------------'

        self.logger.info('Greenplum instance recovery parameters')
        self.logger.info(rule)

        if opts.recoveryConfigFile:
            self.logger.info('Recovery from configuration -i option supplied')
        elif opts.newRecoverHosts is not None:
            self.logger.info('Recovery type              = Pool Host')
            for poolHost in opts.newRecoverHosts:
                self.logger.info('Pool host for recovery     = %s' % poolHost)
        elif opts.rebalanceSegments:
            self.logger.info('Recovery type              = Rebalance')
        else:
            self.logger.info('Recovery type              = Standard')

        if opts.rebalanceSegments:
            total = len(gpArray.get_unbalanced_segdbs())
            for position, toRebalance in enumerate(
                    gpArray.get_unbalanced_segdbs(), 1):
                tabLog = TableLogger()
                self.logger.info(rule)
                self.logger.info('Unbalanced segment %d of %d' % (position, total))
                self.logger.info(rule)
                programIoUtils.appendSegmentInfoForOutput(
                    "Unbalanced", gpArray, toRebalance, tabLog)

                if toRebalance.preferred_role == 'p':
                    balancedRole = "= Primary"
                else:
                    balancedRole = "= Mirror"
                tabLog.info(["Balanced role", balancedRole])

                if toRebalance.role == 'p':
                    currentRole = "= Primary"
                else:
                    currentRole = "= Mirror"
                tabLog.info(["Current role", currentRole])

                tabLog.outputTable()
        else:
            total = len(mirrorBuilder.getMirrorsToBuild())
            for position, toRecover in enumerate(
                    mirrorBuilder.getMirrorsToBuild(), 1):
                self.logger.info(rule)
                self.logger.info('Recovery %d of %d' % (position, total))
                self.logger.info(rule)

                tabLog = TableLogger()

                if toRecover.isFullSynchronization():
                    syncMode = "Full"
                else:
                    syncMode = "Incremental"
                tabLog.info(["Synchronization mode", "= " + syncMode])
                programIoUtils.appendSegmentInfoForOutput(
                    "Failed", gpArray, toRecover.getFailedSegment(), tabLog)
                programIoUtils.appendSegmentInfoForOutput(
                    "Recovery Source", gpArray, toRecover.getLiveSegment(),
                    tabLog)

                target = toRecover.getFailoverSegment()
                if target is None:
                    tabLog.info(["Recovery Target", "= in-place"])
                else:
                    programIoUtils.appendSegmentInfoForOutput(
                        "Recovery Target", gpArray, target, tabLog)
                tabLog.outputTable()

        self.logger.info(rule)

    def __getSimpleSegmentLabel(self, seg):
        """Return "<canonicalized address>:<data directory>" for seg."""
        return "%s:%s" % (canonicalize_address(seg.getSegmentAddress()),
                          seg.getSegmentDataDirectory())

    def __displayRecoveryWarnings(self, mirrorBuilder):
        for warning in self._getRecoveryWarnings(mirrorBuilder):
            self.logger.warn(warning)

    def _getRecoveryWarnings(self, mirrorBuilder):
        """
        return an array of string warnings regarding the recovery
        """
        res = []
        for toRecover in mirrorBuilder.getMirrorsToBuild():

            if toRecover.getFailoverSegment() is not None:
                #
                # user specified a failover location -- warn if it's the same host as its primary
                #
                src = toRecover.getLiveSegment()
                dest = toRecover.getFailoverSegment()

                if src.getSegmentHostName() == dest.getSegmentHostName():
                    res.append(
                        "Segment is being recovered to the same host as its primary: "
                        "primary %s    failover target: %s" %
                        (self.__getSimpleSegmentLabel(src),
                         self.__getSimpleSegmentLabel(dest)))

        for warning in mirrorBuilder.getAdditionalWarnings():
            res.append(warning)

        return res

    def _get_dblist(self):
        """
        Return all rows of database names from PG_DATABASE except template0.

        The connection is opened on dbconn.DbURL() defaults and is closed
        when the method returns.
        """
        # template0 does not accept any connections so we exclude it
        sql = "SELECT datname FROM PG_DATABASE WHERE datname != 'template0'"
        with closing(dbconn.connect(dbconn.DbURL())) as conn:
            result = dbconn.query(conn, sql)
            return result.fetchall()

    def run(self):
        """
        Validate the options, load the cluster configuration and carry out
        the requested action.

        Depending on the options this writes a sample configuration file
        (outputSampleConfigFile), rebalances segments (rebalanceSegments)
        or recovers the failed segments.  On normal completion the process
        is ended with sys.exit(0); it is ended with sys.exit(1) when
        buildMirrors() reports failure.

        raises: ProgramArgumentValidationException for out-of-range -B/-b
                values, for more than one of -i, -p and -r, or for an
                invalid recover hosts value;
                ExceptionNoStackTraceNeeded when mirroring is not
                configured; UserAbortedException when the user declines an
                interactive confirmation
        """
        if self.__options.parallelDegree < 1 or self.__options.parallelDegree > gp.MAX_COORDINATOR_NUM_WORKERS:
            raise ProgramArgumentValidationException(
                "Invalid parallelDegree value provided with -B argument: %d" %
                self.__options.parallelDegree)
        if self.__options.parallelPerHost < 1 or self.__options.parallelPerHost > gp.MAX_SEGHOST_NUM_WORKERS:
            raise ProgramArgumentValidationException(
                "Invalid parallelPerHost value provided with -b argument: %d" %
                self.__options.parallelPerHost)

        self.__pool = WorkerPool(self.__options.parallelDegree)
        gpEnv = GpCoordinatorEnvironment(
            self.__options.coordinatorDataDirectory, True)

        # verify "where to recover" options
        optionCnt = 0
        if self.__options.newRecoverHosts is not None:
            optionCnt += 1
        if self.__options.recoveryConfigFile is not None:
            optionCnt += 1
        if self.__options.rebalanceSegments:
            optionCnt += 1
        if optionCnt > 1:
            raise ProgramArgumentValidationException(
                "Only one of -i, -p, and -r may be specified")

        faultProberInterface.getFaultProber().initializeProber(
            gpEnv.getCoordinatorPort())

        confProvider = configInterface.getConfigurationProvider(
        ).initializeProvider(gpEnv.getCoordinatorPort())

        gpArray = confProvider.loadSystemConfig(useUtilityMode=False)

        # Checked before the loop below, which dereferences
        # segmentPair.mirrorDB for every pair.
        # NOTE(review): presumably mirrorDB is unset when mirroring is not
        # configured -- confirm against gparray.
        if not gpArray.hasMirrors:
            raise ExceptionNoStackTraceNeeded(
                'GPDB Mirroring replication is not configured for this Greenplum Database instance.'
            )

        # We have phys-rep/filerep mirrors.

        num_workers = min(len(gpArray.get_hostlist()),
                          self.__options.parallelDegree)
        hosts = set(gpArray.get_hostlist(includeCoordinator=False))
        unreachable_hosts = get_unreachable_segment_hosts(hosts, num_workers)
        # flag both members of each pair whose host did not answer
        for i, segmentPair in enumerate(gpArray.segmentPairs):
            if segmentPair.primaryDB.getSegmentHostName() in unreachable_hosts:
                logger.warning(
                    "Not recovering segment %d because %s is unreachable" %
                    (segmentPair.primaryDB.dbid,
                     segmentPair.primaryDB.getSegmentHostName()))
                gpArray.segmentPairs[i].primaryDB.unreachable = True

            if segmentPair.mirrorDB.getSegmentHostName() in unreachable_hosts:
                logger.warning(
                    "Not recovering segment %d because %s is unreachable" %
                    (segmentPair.mirrorDB.dbid,
                     segmentPair.mirrorDB.getSegmentHostName()))
                gpArray.segmentPairs[i].mirrorDB.unreachable = True

        if self.__options.newRecoverHosts is not None:
            # turn the comma separated option value into a list of unique,
            # stripped host names, keeping first-seen order
            try:
                uniqueHosts = []
                for h in self.__options.newRecoverHosts.split(','):
                    if h.strip() not in uniqueHosts:
                        uniqueHosts.append(h.strip())
                self.__options.newRecoverHosts = uniqueHosts
            except Exception as ex:
                raise ProgramArgumentValidationException( \
                    "Invalid value for recover hosts: %s" % ex)

        # retain list of hosts that were existing in the system prior to getRecoverActions...
        # this will be needed for later calculations that determine whether
        # new hosts were added into the system
        existing_hosts = set(gpArray.getHostList())

        # figure out what needs to be done
        mirrorBuilder = self.getRecoveryActionsBasedOnOptions(gpEnv, gpArray)

        if self.__options.outputSampleConfigFile is not None:
            # just output config file and done
            self.outputToFile(mirrorBuilder, gpArray,
                              self.__options.outputSampleConfigFile)
            self.logger.info('Configuration file output to %s successfully.' %
                             self.__options.outputSampleConfigFile)
        elif self.__options.rebalanceSegments:
            assert (isinstance(mirrorBuilder, GpSegmentRebalanceOperation))

            # Make sure we have work to do
            if len(gpArray.get_unbalanced_segdbs()) == 0:
                self.logger.info(
                    "No segments are running in their non-preferred role and need to be rebalanced."
                )
            else:
                self.displayRecovery(mirrorBuilder, gpArray)

                if self.__options.interactive:
                    # Logger.warn is a deprecated alias of Logger.warning
                    self.logger.warning(
                        "This operation will cancel queries that are currently executing."
                    )
                    self.logger.warning(
                        "Connections to the database however will not be interrupted."
                    )
                    if not userinput.ask_yesno(
                            None,
                            "\nContinue with segment rebalance procedure",
                            'N'):
                        raise UserAbortedException()

                fullRebalanceDone = mirrorBuilder.rebalance()
                self.logger.info(
                    "******************************************************************"
                )
                if fullRebalanceDone:
                    self.logger.info(
                        "The rebalance operation has completed successfully.")
                else:
                    self.logger.info(
                        "The rebalance operation has completed with WARNINGS."
                        " Please review the output in the gprecoverseg log.")
                self.logger.info(
                    "******************************************************************"
                )

        elif len(mirrorBuilder.getMirrorsToBuild()) == 0:
            self.logger.info('No segments to recover')
        else:
            mirrorBuilder.checkForPortAndDirectoryConflicts(gpArray)
            self.validate_heap_checksum_consistency(gpArray, mirrorBuilder)

            self.displayRecovery(mirrorBuilder, gpArray)
            self.__displayRecoveryWarnings(mirrorBuilder)

            if self.__options.interactive:
                if not userinput.ask_yesno(
                        None, "\nContinue with segment recovery procedure",
                        'N'):
                    raise UserAbortedException()

            # sync packages
            current_hosts = set(gpArray.getHostList())
            new_hosts = current_hosts - existing_hosts
            if new_hosts:
                self.syncPackages(new_hosts)

            contentsToUpdate = [
                seg.getLiveSegment().getSegmentContentId()
                for seg in mirrorBuilder.getMirrorsToBuild()
            ]
            config_primaries_for_replication(gpArray,
                                             self.__options.hba_hostnames,
                                             contentsToUpdate)
            if not mirrorBuilder.buildMirrors("recover", gpEnv, gpArray):
                sys.exit(1)

            self.trigger_fts_probe(port=gpEnv.getCoordinatorPort())

            self.logger.info("********************************")
            self.logger.info("Segments successfully recovered.")
            self.logger.info("********************************")

        sys.exit(0)

    def trigger_fts_probe(self, port=0):
        """
        Ask the database to run FTS probe scans.

        port -- port passed to dbconn.DbURL for the connection

        The connection is closed even when a probe request raises.
        """
        self.logger.info('Triggering FTS probe')
        conn = dbconn.connect(dbconn.DbURL(port=port))
        try:
            # XXX Perform two probe scans in a row, to work around a known
            # race where gp_request_fts_probe_scan() can return early during the
            # first call. Remove this duplication once that race is fixed.
            for _ in range(2):
                dbconn.execSQL(conn, "SELECT gp_request_fts_probe_scan()")
        finally:
            conn.close()

    def validate_heap_checksum_consistency(self, gpArray, mirrorBuilder):
        live_segments = [
            target.getLiveSegment()
            for target in mirrorBuilder.getMirrorsToBuild()
        ]
        if len(live_segments) == 0:
            self.logger.info(
                "No checksum validation necessary when there are no segments to recover."
            )
            return

        heap_checksum = HeapChecksum(gpArray,
                                     num_workers=min(
                                         self.__options.parallelDegree,
                                         len(live_segments)),
                                     logger=self.logger)
        successes, failures = heap_checksum.get_segments_checksum_settings(
            live_segments)
        # go forward if we have at least one segment that has replied
        if len(successes) == 0:
            raise Exception(
                "No segments responded to ssh query for heap checksum validation."
            )
        consistent, inconsistent, coordinator_checksum_value = heap_checksum.check_segment_consistency(
            successes)
        if len(inconsistent) > 0:
            self.logger.fatal(
                "Heap checksum setting differences reported on segments")
            self.logger.fatal("Failed checksum consistency validation:")
            for gpdb in inconsistent:
                segment_name = gpdb.getSegmentHostName()
                checksum = gpdb.heap_checksum
                self.logger.fatal(
                    "%s checksum set to %s differs from coordinator checksum set to %s"
                    % (segment_name, checksum, coordinator_checksum_value))
            raise Exception(
                "Heap checksum setting differences reported on segments")
        self.logger.info(
            "Heap checksum setting is consistent between coordinator and the segments that are candidates "
            "for recoverseg")

    def cleanup(self):
        if self.__pool:
            self.__pool.haltWork()  # \  MPP-13489, CR-2572
            self.__pool.joinWorkers()  # > all three of these appear necessary
            self.__pool.join()  # /  see MPP-12633, CR-2252 as well

    # -------------------------------------------------------------------------

    @staticmethod
    def createParser():
        """
        Build and return the command-line option parser for this program.

        The parser is an OptParser using OptChecker as its option class.
        Options are registered in this order:
          * the standard logging/help group, extended with -s and
            --no-progress
          * "Connection Options" (filled in by
            addCoordinatorDirectoryOptionForSingleClusterProgram)
          * "Recovery Source Options": -i, -o
          * "Recovery Destination Options": -p
          * "Recovery Options": -F, -B, -b, -r, --hba-hostnames
        """
        summary = "Recover a failed segment"
        helpLines = [""]

        optParser = OptParser(option_class=OptChecker,
                              description=' '.join(summary.split()),
                              version='%prog version $Revision$')
        optParser.setHelp(helpLines)

        # Progress-display switches live next to the standard logging options.
        logOptions = addStandardLoggingAndHelpOptions(optParser, True)
        logOptions.add_option(
            "-s", None,
            dest='showProgressInplace', action='store_false', default=None,
            help='Show pg_basebackup/pg_rewind progress sequentially instead of inplace')
        logOptions.add_option(
            "--no-progress",
            dest="showProgress", action="store_false", default=True,
            help="Suppress pg_basebackup/pg_rewind progress output")

        connectionOptions = OptionGroup(optParser, "Connection Options")
        optParser.add_option_group(connectionOptions)
        addCoordinatorDirectoryOptionForSingleClusterProgram(connectionOptions)

        sourceOptions = OptionGroup(optParser, "Recovery Source Options")
        optParser.add_option_group(sourceOptions)
        sourceOptions.add_option(
            "-i", None,
            dest="recoveryConfigFile", type="string", metavar="<configFile>",
            help="Recovery configuration file")
        sourceOptions.add_option(
            "-o", None,
            dest="outputSampleConfigFile", type="string", metavar="<configFile>",
            help=("Sample configuration file name to output; "
                  "this file can be passed to a subsequent call using -i option"))

        destinationOptions = OptionGroup(optParser, "Recovery Destination Options")
        optParser.add_option_group(destinationOptions)
        destinationOptions.add_option(
            "-p", None,
            dest="newRecoverHosts", type="string", metavar="<targetHosts>",
            help="Spare new hosts to which to recover segments")

        # Help texts that embed the upper bounds defined in the gp module.
        hostParallelHelp = (
            "Max number of hosts to operate on in parallel. Valid values are: 1-%d"
            % gp.MAX_COORDINATOR_NUM_WORKERS)
        segmentParallelHelp = (
            "Max number of segments per host to operate on in parallel. Valid values are: 1-%d"
            % gp.MAX_SEGHOST_NUM_WORKERS)

        recoveryOptions = OptionGroup(optParser, "Recovery Options")
        optParser.add_option_group(recoveryOptions)
        recoveryOptions.add_option(
            '-F', None,
            dest="forceFullResynchronization", action='store_true', default=False,
            metavar="<forceFullResynchronization>",
            help="Force full segment resynchronization")
        recoveryOptions.add_option(
            "-B", None,
            dest="parallelDegree", type="int",
            default=gp.DEFAULT_COORDINATOR_NUM_WORKERS,
            metavar="<parallelDegree>",
            help=hostParallelHelp)
        recoveryOptions.add_option(
            "-b", None,
            dest="parallelPerHost", type="int",
            default=gp.DEFAULT_SEGHOST_NUM_WORKERS,
            metavar="<parallelPerHost>",
            help=segmentParallelHelp)
        recoveryOptions.add_option(
            "-r", None,
            dest='rebalanceSegments', action='store_true', default=False,
            help='Rebalance synchronized segments.')
        recoveryOptions.add_option(
            '', '--hba-hostnames',
            dest='hba_hostnames', action='store_true',
            help='use hostnames instead of CIDR in pg_hba.conf')

        optParser.set_defaults()
        return optParser

    @staticmethod
    def createProgram(options, args):
        """
        Create the GpRecoverSegmentProgram for the parsed command line.

        options -- parsed option values, handed to the program unchanged
        args    -- leftover positional arguments; none are accepted

        Raises ProgramArgumentValidationException when any positional
        argument is present.
        """
        leftoverCount = len(args)
        if leftoverCount == 0:
            return GpRecoverSegmentProgram(options)

        # NOTE(review): the trailing True presumably asks for usage text to
        # be shown along with the error -- confirm against the exception class.
        raise ProgramArgumentValidationException(
            "too many arguments: only options may be specified", True)

    @staticmethod
    def mainOptions():
        """
        Return the settings dictionary consumed by the simple_main framework.

        The 'pidlockpath' entry makes the framework check for a
        gprecoverseg.lock file under COORDINATOR_DATA_DIRECTORY, which keeps
        the customer from running more than one gprecoverseg instance at the
        same time.
        """
        # NOTE(review): 'parentpidvar' presumably names the environment
        # variable carrying the parent PID -- confirm against simple_main.
        frameworkSettings = dict(
            pidlockpath='gprecoverseg.lock',
            parentpidvar='GPRECOVERPID',
        )
        return frameworkSettings
Ejemplo n.º 22
0
class WorkerPoolTest(unittest.TestCase):
    """
    Unit tests for WorkerPool and the join_and_indicate_progress() helper.

    Every test runs against a fresh single-worker pool created in setUp().
    Commands are mock.Mock objects restricted to the Command spec, so the
    tests control exactly what run() does.
    """

    def setUp(self):
        self.pool = WorkerPool(numWorkers=1, logger=mock.Mock())

    def tearDown(self):
        # Background threads that are left running make the test runner hang,
        # so halt them first and then join them to start the next test clean.
        self.pool.haltWork()
        self.pool.joinWorkers()

    # ------------------------------------------------------------------
    # Helpers (not collected as tests)
    # ------------------------------------------------------------------

    def _new_command(self):
        """Return a fresh mock that follows the Command spec."""
        return mock.Mock(spec=Command)

    def _new_gated_command(self):
        """
        Return (command, gate): command.run() blocks until gate is set.
        """
        gated = mock.Mock(spec=Command)
        gate = threading.Event()

        def block_until_released():
            gate.wait()

        gated.run.side_effect = block_until_released
        return gated, gate

    def _new_sleeping_command(self, duration):
        """Return a command whose run() sleeps for `duration` seconds."""
        sleeper = mock.Mock(spec=Command)

        def sleep_for_duration():
            time.sleep(duration)

        sleeper.run.side_effect = sleep_for_duration
        return sleeper

    def _new_command_with_outcome(self, succeeded):
        """
        Return a command whose get_results().wasSuccessful() gives `succeeded`.
        """
        reporting = mock.Mock(spec=Command)
        # Command.get_results() returns a CommandResult, and
        # CommandResult.wasSuccessful() reports whether the command succeeded.
        reporting.get_results.return_value.wasSuccessful.return_value = succeeded
        return reporting

    def _queue_commands(self, count):
        """Add `count` fresh mock commands to the pool, one after another."""
        for _ in range(count):
            self.pool.addCommand(mock.Mock(spec=Command))

    # ------------------------------------------------------------------
    # Basic pool behavior
    # ------------------------------------------------------------------

    def test_pool_must_have_some_workers(self):
        self.assertRaises(Exception, WorkerPool, numWorkers=0)

    def test_pool_runs_added_command(self):
        queued = self._new_command()

        self.pool.addCommand(queued)
        self.pool.join()

        queued.run.assert_called_once_with()

    def test_completed_commands_are_retrievable(self):
        queued = self._new_command()

        # The mock command does no work, so it should complete quickly.
        self.pool.addCommand(queued)
        self.pool.join()

        self.assertEqual(self.pool.getCompletedItems(), [queued])

    def test_pool_is_not_marked_done_until_commands_finish(self):
        gated, gate = self._new_gated_command()

        self.assertTrue(self.pool.isDone())

        try:
            self.pool.addCommand(gated)
            self.assertFalse(self.pool.isDone())
        finally:
            # Release the worker thread even when an assertion above failed.
            gate.set()

        self.pool.join()

        self.assertTrue(self.pool.isDone())

    def test_pool_can_be_emptied_of_completed_commands(self):
        queued = self._new_command()

        self.pool.addCommand(queued)
        self.pool.join()

        self.pool.empty_completed_items()
        self.assertEqual(self.pool.getCompletedItems(), [])

    # ------------------------------------------------------------------
    # check_results()
    # ------------------------------------------------------------------

    def test_check_results_succeeds_when_no_items_fail(self):
        passing = self._new_command_with_outcome(True)

        self.pool.addCommand(passing)
        self.pool.join()
        self.pool.check_results()

    def test_check_results_throws_exception_at_first_failure(self):
        failing = self._new_command_with_outcome(False)

        self.pool.addCommand(failing)
        self.pool.join()

        self.assertRaises(ExecutionError, self.pool.check_results)

    # ------------------------------------------------------------------
    # join() with a timeout
    # ------------------------------------------------------------------

    def test_join_with_timeout_returns_done_immediately_if_there_is_nothing_to_do(
            self):
        started_at = time.time()
        finished = self.pool.join(10)
        elapsed = time.time() - started_at

        self.assertTrue(finished)

        # "Immediately" is hard to pin down. Anything longer than two seconds
        # is treated as a failure, which should hold even on a heavily loaded
        # test container.
        self.assertLess(elapsed, 2)

    def test_join_with_timeout_doesnt_return_done_until_all_commands_complete(
            self):
        gated, gate = self._new_gated_command()

        try:
            self.pool.addCommand(gated)

            # A tiny positive timeout first, then zero and negative timeouts.
            for timeout in (0.001, 0, -1):
                finished = self.pool.join(timeout)
                self.assertFalse(finished)
        finally:
            # Release the worker thread even when an assertion above failed.
            gate.set()

        # Should be immediate, but there is still a race with the worker.
        finished = self.pool.join(2)
        self.assertTrue(finished)

    # ------------------------------------------------------------------
    # The `completed` counter
    # ------------------------------------------------------------------

    def test_completed_returns_number_of_completed_commands(self):
        self._queue_commands(3)

        self.pool.join()
        self.assertEqual(self.pool.completed, 3)

    def test_completed_can_be_cleared_back_to_zero(self):
        self._queue_commands(3)

        self.pool.join()
        self.pool.empty_completed_items()
        self.assertEqual(self.pool.completed, 0)

    def test_completed_is_reset_to_zero_after_getCompletedItems(self):
        self._queue_commands(3)

        self.pool.join()
        self.pool.getCompletedItems()
        self.assertEqual(self.pool.completed, 0)

    # ------------------------------------------------------------------
    # The `assigned` counter
    # ------------------------------------------------------------------

    def test_assigned_returns_number_of_assigned_commands(self):
        self._queue_commands(3)

        self.assertEqual(self.pool.assigned, 3)

    def test_assigned_is_decremented_when_completed_items_are_emptied(self):
        self._queue_commands(3)

        self.pool.join()
        self.pool.empty_completed_items()

        self.assertEqual(self.pool.assigned, 0)

    def test_assigned_is_decremented_when_completed_items_are_checked(self):
        passing = self._new_command_with_outcome(True)

        self.pool.addCommand(passing)

        self.pool.join()
        self.pool.check_results()

        self.assertEqual(self.pool.assigned, 0)

    def test_assigned_is_decremented_when_completed_items_are_popped(self):
        # The first command finishes right away.
        quick = self._new_command()
        self.pool.addCommand(quick)

        # The second one waits until the test lets it continue.
        gated, gate = self._new_gated_command()

        try:
            self.pool.addCommand(gated)
            self.assertEqual(self.pool.assigned, 2)

            # Avoid race flakes: wait until the first command has really
            # been completed.
            while self.pool.completed < 1:
                self.pool.join(0.001)

            # Pop the completed item ...
            self.assertEqual(self.pool.getCompletedItems(), [quick])

            # ... which should leave exactly one assigned command.
            self.assertEqual(self.pool.assigned, 1)
        finally:
            # Release the worker thread even when an assertion above failed.
            gate.set()

        self.pool.join()

        # Pop the remaining completed item.
        self.assertEqual(self.pool.getCompletedItems(), [gated])
        self.assertEqual(self.pool.assigned, 0)

    # ------------------------------------------------------------------
    # join_and_indicate_progress()
    # ------------------------------------------------------------------

    def test_join_and_indicate_progress_prints_nothing_if_pool_is_done(self):
        captured = StringIO.StringIO()
        join_and_indicate_progress(self.pool, captured)

        self.assertEqual(captured.getvalue(), '')

    def test_join_and_indicate_progress_prints_dots_until_pool_is_done(self):
        # To keep the races here from causing false negatives, arrange for
        # about ten dots to be printed on average and only require that at
        # least five of them show up.
        duration = 0.01

        self.pool.addCommand(self._new_sleeping_command(duration))

        captured = StringIO.StringIO()
        join_and_indicate_progress(self.pool, captured,
                                   interval=(duration / 10))

        printed = captured.getvalue()
        self.assertIn('.....', printed)
        self.assertTrue(printed.endswith('\n'))

    def test_join_and_indicate_progress_flushes_every_dot(self):
        # Same kind of scenario as the dot-printing test above.
        duration = 0.005

        self.pool.addCommand(self._new_sleeping_command(duration))

        stream = mock.Mock(spec=file)
        join_and_indicate_progress(self.pool, stream, interval=(duration / 5))

        dot_written = mock.call.write('.')
        for position, recorded in enumerate(stream.mock_calls):
            # Every written dot must be followed directly by a flush().
            if recorded == dot_written:
                self.assertEqual(stream.mock_calls[position + 1],
                                 mock.call.flush())
Ejemplo n.º 23
0
class WorkerPoolTest(unittest.TestCase):
    """
    Unit tests for WorkerPool and the join_and_indicate_progress() helper.

    Every test runs against a fresh single-worker pool created in setUp().
    Commands are mock.Mock objects restricted to the Command spec, so the
    tests control exactly what run() does.
    """

    def setUp(self):
        self.pool = WorkerPool(numWorkers=1, logger=mock.Mock())

    def tearDown(self):
        # Background threads that are left running make the test runner hang,
        # so halt them first and then join them to start the next test clean.
        self.pool.haltWork()
        self.pool.joinWorkers()

    # ------------------------------------------------------------------
    # Helpers (not collected as tests)
    # ------------------------------------------------------------------

    def _new_command(self):
        """Return a fresh mock that follows the Command spec."""
        return mock.Mock(spec=Command)

    def _new_gated_command(self):
        """
        Return (command, gate): command.run() blocks until gate is set.
        """
        gated = mock.Mock(spec=Command)
        gate = threading.Event()

        def block_until_released():
            gate.wait()

        gated.run.side_effect = block_until_released
        return gated, gate

    def _new_sleeping_command(self, duration):
        """Return a command whose run() sleeps for `duration` seconds."""
        sleeper = mock.Mock(spec=Command)

        def sleep_for_duration():
            time.sleep(duration)

        sleeper.run.side_effect = sleep_for_duration
        return sleeper

    def _new_command_with_outcome(self, succeeded):
        """
        Return a command whose get_results().wasSuccessful() gives `succeeded`.
        """
        reporting = mock.Mock(spec=Command)
        # Command.get_results() returns a CommandResult, and
        # CommandResult.wasSuccessful() reports whether the command succeeded.
        reporting.get_results.return_value.wasSuccessful.return_value = succeeded
        return reporting

    def _queue_commands(self, count):
        """Add `count` fresh mock commands to the pool, one after another."""
        for _ in range(count):
            self.pool.addCommand(mock.Mock(spec=Command))

    # ------------------------------------------------------------------
    # Basic pool behavior
    # ------------------------------------------------------------------

    def test_pool_must_have_some_workers(self):
        self.assertRaises(Exception, WorkerPool, numWorkers=0)

    def test_pool_runs_added_command(self):
        queued = self._new_command()

        self.pool.addCommand(queued)
        self.pool.join()

        queued.run.assert_called_once_with()

    def test_completed_commands_are_retrievable(self):
        queued = self._new_command()

        # The mock command does no work, so it should complete quickly.
        self.pool.addCommand(queued)
        self.pool.join()

        self.assertEqual(self.pool.getCompletedItems(), [queued])

    def test_pool_is_not_marked_done_until_commands_finish(self):
        gated, gate = self._new_gated_command()

        self.assertTrue(self.pool.isDone())

        try:
            self.pool.addCommand(gated)
            self.assertFalse(self.pool.isDone())
        finally:
            # Release the worker thread even when an assertion above failed.
            gate.set()

        self.pool.join()

        self.assertTrue(self.pool.isDone())

    def test_pool_can_be_emptied_of_completed_commands(self):
        queued = self._new_command()

        self.pool.addCommand(queued)
        self.pool.join()

        self.pool.empty_completed_items()
        self.assertEqual(self.pool.getCompletedItems(), [])

    # ------------------------------------------------------------------
    # check_results()
    # ------------------------------------------------------------------

    def test_check_results_succeeds_when_no_items_fail(self):
        passing = self._new_command_with_outcome(True)

        self.pool.addCommand(passing)
        self.pool.join()
        self.pool.check_results()

    def test_check_results_throws_exception_at_first_failure(self):
        failing = self._new_command_with_outcome(False)

        self.pool.addCommand(failing)
        self.pool.join()

        self.assertRaises(ExecutionError, self.pool.check_results)

    # ------------------------------------------------------------------
    # join() with a timeout
    # ------------------------------------------------------------------

    def test_join_with_timeout_returns_done_immediately_if_there_is_nothing_to_do(self):
        started_at = time.time()
        finished = self.pool.join(10)
        elapsed = time.time() - started_at

        self.assertTrue(finished)

        # "Immediately" is hard to pin down. Anything longer than two seconds
        # is treated as a failure, which should hold even on a heavily loaded
        # test container.
        self.assertLess(elapsed, 2)

    def test_join_with_timeout_doesnt_return_done_until_all_commands_complete(self):
        gated, gate = self._new_gated_command()

        try:
            self.pool.addCommand(gated)

            # A tiny positive timeout first, then zero and negative timeouts.
            for timeout in (0.001, 0, -1):
                finished = self.pool.join(timeout)
                self.assertFalse(finished)
        finally:
            # Release the worker thread even when an assertion above failed.
            gate.set()

        # Should be immediate, but there is still a race with the worker.
        finished = self.pool.join(2)
        self.assertTrue(finished)

    # ------------------------------------------------------------------
    # The `completed` counter
    # ------------------------------------------------------------------

    def test_completed_returns_number_of_completed_commands(self):
        self._queue_commands(3)

        self.pool.join()
        self.assertEqual(self.pool.completed, 3)

    def test_completed_can_be_cleared_back_to_zero(self):
        self._queue_commands(3)

        self.pool.join()
        self.pool.empty_completed_items()
        self.assertEqual(self.pool.completed, 0)

    def test_completed_is_reset_to_zero_after_getCompletedItems(self):
        self._queue_commands(3)

        self.pool.join()
        self.pool.getCompletedItems()
        self.assertEqual(self.pool.completed, 0)

    # ------------------------------------------------------------------
    # The `assigned` counter
    # ------------------------------------------------------------------

    def test_assigned_returns_number_of_assigned_commands(self):
        self._queue_commands(3)

        self.assertEqual(self.pool.assigned, 3)

    def test_assigned_is_decremented_when_completed_items_are_emptied(self):
        self._queue_commands(3)

        self.pool.join()
        self.pool.empty_completed_items()

        self.assertEqual(self.pool.assigned, 0)

    def test_assigned_is_decremented_when_completed_items_are_checked(self):
        passing = self._new_command_with_outcome(True)

        self.pool.addCommand(passing)

        self.pool.join()
        self.pool.check_results()

        self.assertEqual(self.pool.assigned, 0)

    def test_assigned_is_decremented_when_completed_items_are_popped(self):
        # The first command finishes right away.
        quick = self._new_command()
        self.pool.addCommand(quick)

        # The second one waits until the test lets it continue.
        gated, gate = self._new_gated_command()

        try:
            self.pool.addCommand(gated)
            self.assertEqual(self.pool.assigned, 2)

            # Avoid race flakes: wait until the first command has really
            # been completed.
            while self.pool.completed < 1:
                self.pool.join(0.001)

            # Pop the completed item ...
            self.assertEqual(self.pool.getCompletedItems(), [quick])

            # ... which should leave exactly one assigned command.
            self.assertEqual(self.pool.assigned, 1)
        finally:
            # Release the worker thread even when an assertion above failed.
            gate.set()

        self.pool.join()

        # Pop the remaining completed item.
        self.assertEqual(self.pool.getCompletedItems(), [gated])
        self.assertEqual(self.pool.assigned, 0)

    # ------------------------------------------------------------------
    # join_and_indicate_progress()
    # ------------------------------------------------------------------

    def test_join_and_indicate_progress_prints_nothing_if_pool_is_done(self):
        captured = StringIO.StringIO()
        join_and_indicate_progress(self.pool, captured)

        self.assertEqual(captured.getvalue(), '')

    def test_join_and_indicate_progress_prints_dots_until_pool_is_done(self):
        # To keep the races here from causing false negatives, arrange for
        # about ten dots to be printed on average and only require that at
        # least five of them show up.
        duration = 0.01

        self.pool.addCommand(self._new_sleeping_command(duration))

        captured = StringIO.StringIO()
        join_and_indicate_progress(self.pool, captured,
                                   interval=(duration / 10))

        printed = captured.getvalue()
        self.assertIn('.....', printed)
        self.assertTrue(printed.endswith('\n'))

    def test_join_and_indicate_progress_flushes_every_dot(self):
        # Same kind of scenario as the dot-printing test above.
        duration = 0.005

        self.pool.addCommand(self._new_sleeping_command(duration))

        stream = mock.Mock(spec=file)
        join_and_indicate_progress(self.pool, stream, interval=(duration / 5))

        dot_written = mock.call.write('.')
        for position, recorded in enumerate(stream.mock_calls):
            # Every written dot must be followed directly by a flush().
            if recorded == dot_written:
                self.assertEqual(stream.mock_calls[position + 1],
                                 mock.call.flush())