def _stop_failed_segments(self, gpEnv):
    """
    Issue a fast-mode stop for the segments reported by
    _get_failed_reachable_segments() that _get_running_postgres_segments()
    still returns.

    @param gpEnv supplies the GPHOME and version handed to the remote stop command
    """
    failed_segs = self._get_failed_reachable_segments()
    failed_count = len(failed_segs)
    if failed_count == 0:
        return

    self.__logger.info("Ensuring %d failed segment(s) are stopped" % failed_count)

    running_segs = self._get_running_postgres_segments(failed_segs)
    segs_by_host = GpArray.getSegmentsByHostName(running_segs)

    # one stop command per host, covering all of that host's segments
    stop_cmds = [
        gp.GpSegStopCmd("remote segment stop on host '%s'" % host,
                        gpEnv.getGpHome(),
                        gpEnv.getGpVersion(),
                        mode='fast',
                        dbs=host_segs,
                        verbose=gplog.logging_is_verbose(),
                        ctxt=base.REMOTE,
                        remoteHost=host,
                        segment_batch_size=self.__parallelPerHost)
        for host, host_segs in segs_by_host.items()
    ]

    # Error checking is suppressed on purpose: gpsegstop frequently reports an
    # error even though the stop went through (for example, the segment was
    # slow to shut down, so gpsegstop killed it and then returned an error).
    #
    # A possible improvement: check whether the segment is running and only
    # attempt the stop in that case; errors could then be propagated.
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(stop_cmds, suppressErrorCheck=True)
def __ensureStopped(self, gpEnv, directives):
    """
    Issue a fast-mode stop for the segments named by the directives, limited
    to those that _get_running_postgres_segments() returns.

    @param gpEnv supplies the GPHOME and version handed to the remote stop command
    @param directives a list of the GpStopSegmentDirectoryDirective values indicating which segments to stop
    """
    if len(directives) == 0:
        return

    self.__logger.info("Ensuring %d failed segment(s) are stopped" % (len(directives)))
    segments = [d.getSegment() for d in directives]
    segments = self._get_running_postgres_segments(segments)
    segmentByHost = GpArray.getSegmentsByHostName(segments)

    cmds = []
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, hostSegments in segmentByHost.items():
        cmd = gp.GpSegStopCmd("remote segment stop on host '%s'" % hostName,
                              gpEnv.getGpHome(), gpEnv.getGpVersion(),
                              mode='fast', dbs=hostSegments, verbose=gplog.logging_is_verbose(),
                              ctxt=base.REMOTE, remoteHost=hostName)

        cmds.append(cmd)

    # we suppress checking for the error.  This is because gpsegstop will actually error
    #  in many cases where the stop is actually done (that is, for example, the segment is
    #  running but slow to shutdown so gpsegstop errors after whacking it with a kill)
    #
    # Perhaps we should make it so that it checks if the seg is running and only attempts the stop
    #  if it's running?  In that case, we could propagate the error
    #
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "stopping segments", suppressErrorCheck=True)
def _clean_up_failed_segments(self):
    """
    Clean the directories of failed segments that are being recovered in
    place (a failed segment with no failover segment) with full
    synchronization.
    """
    in_place_segs = []
    for mirror in self.__mirrorsToBuild:
        # guard clauses: skip anything that is not an in-place, full recovery
        if mirror.getFailedSegment() is None:
            continue
        if mirror.getFailoverSegment() is not None:
            continue
        if not mirror.isFullSynchronization():
            continue
        in_place_segs.append(mirror.getFailedSegment())

    seg_count = len(in_place_segs)
    if seg_count == 0:
        return

    self.__logger.info("Cleaning files from %d segment(s)" % seg_count)

    segs_by_host = GpArray.getSegmentsByHostName(in_place_segs)
    # one remote clean command per host
    clean_cmds = [
        gp.GpCleanSegmentDirectories("clean segment directories on %s" % host,
                                     host_segs, gp.REMOTE, host)
        for host, host_segs in segs_by_host.items()
    ]

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(clean_cmds)
def checkForPortAndDirectoryConflicts(self, gpArray):
    """
    Verify that, on each host, no two segments of gpArray share a port or a
    data directory.

    Raises an Exception describing the first conflict found.
    """
    segs_by_host = GpArray.getSegmentsByHostName(gpArray.getDbList())
    for host, host_segs in segs_by_host.items():
        # each map records which dbid first claimed the port / directory on this host
        port_owner = {}
        dir_owner = {}
        for seg in host_segs:
            seg_port = seg.getSegmentPort()
            seg_dbid = seg.getSegmentDbId()

            # port conflict
            if seg_port in port_owner:
                raise Exception("Segment dbid's %s and %s on host %s cannot have the same port %s." %
                                (seg_dbid, port_owner[seg_port], host, seg_port))
            port_owner[seg_port] = seg_dbid

            # data directory conflict
            data_dir = seg.getSegmentDataDirectory()
            if data_dir in dir_owner:
                raise Exception("Segment dbid's %s and %s on host %s cannot have the same data directory '%s'." %
                                (seg_dbid, dir_owner[data_dir], host, data_dir))
            dir_owner[data_dir] = seg_dbid
def checkForPortAndDirectoryConflicts(self, gpArray):
    """
    Check gpArray for internal consistency -- no duplicate ports or directories on the same host, for example

    A detected problem causes an Exception to be raised
    """
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, segmentArr in GpArray.getSegmentsByHostName(gpArray.getDbList()).items():
        # maps of port / data directory -> dbid of the segment that first used it on this host
        usedPorts = {}
        usedDataDirectories = {}
        for segment in segmentArr:

            # check for port conflict
            port = segment.getSegmentPort()
            dbid = segment.getSegmentDbId()
            if port in usedPorts:
                raise Exception("Segment dbid's %s and %s on host %s cannot have the same port %s." %
                                (dbid, usedPorts.get(port), hostName, port))

            usedPorts[port] = dbid

            # check for directory conflict; could improve this by reporting nicer the conflicts
            path = segment.getSegmentDataDirectory()

            if path in usedDataDirectories:
                raise Exception("Segment dbid's %s and %s on host %s cannot have the same data directory '%s'." %
                                (dbid, usedDataDirectories.get(path), hostName, path))
            usedDataDirectories[path] = dbid
def __ensureSharedMemCleaned(self, gpEnv, directives):
    """
    Run a CleanSharedMem operation on each host for the segments named by the
    directives. A failure on a host is logged as a warning and not raised.

    @param gpEnv not used by this method
    @param directives a list of the GpStopSegmentDirectoryDirective values indicating which segments to cleanup
    """
    if len(directives) == 0:
        return

    logger.info('Ensuring that shared memory is cleaned up for stopped segments')
    stopped_segs = [directive.getSegment() for directive in directives]

    # one remote operation per host
    remote_ops = []
    for host, host_segs in GpArray.getSegmentsByHostName(stopped_segs).items():
        remote_ops.append(RemoteOperation(CleanSharedMem(host_segs), host=host))

    ParallelOperation(remote_ops).run()

    # best effort: get_ret() raising only produces a warning for that host
    for remote_op in remote_ops:
        try:
            remote_op.get_ret()
        except Exception:
            logger.warning('Unable to clean up shared memory for stopped segments on host (%s)' % remote_op.host)
def __runStartCommand(self, segments, startMethod, numContentsInCluster, resultOut, gpArray, era):
    """
    Dispatch one remote start command per host for the given segments, wait
    for the worker pool, and put results into the resultOut object.

    @param segments the segments to start; nothing is done when empty
    @param startMethod selects the log message, the pre-transition mirroring mode and
                       whether pickled transition data is sent
    @param numContentsInCluster passed through to the start command
    @param resultOut receives the results via __processStartOrConvertCommands
    @param gpArray used to obtain the dbid-to-peer map
    @param era passed through to the start command
    """
    if len(segments) == 0:
        return

    if startMethod == START_AS_PRIMARY_OR_MIRROR:
        logger.info("Commencing parallel primary and mirror segment instance startup, please wait...")
    else:
        logger.info("Commencing parallel segment instance startup, please wait...")
    dbIdToPeerMap = gpArray.getDbIdToPeerMap()

    mirroringModePreTransition = MIRROR_MODE_MIRRORLESS if startMethod == START_AS_MIRRORLESS else MIRROR_MODE_QUIESCENT

    # launch the start
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, hostSegments in GpArray.getSegmentsByHostName(segments).items():
        logger.debug("Dispatching command to start segments on host: %s, " \
                     "with %s contents in cluster" % (hostName, numContentsInCluster))

        pickledTransitionData = None
        if startMethod == START_AS_PRIMARY_OR_MIRROR:
            mirroringModePerSegment = []
            for seg in hostSegments:
                modeThisSegment = MIRROR_MODE_PRIMARY if seg.isSegmentPrimary(True) else MIRROR_MODE_MIRROR
                mirroringModePerSegment.append(modeThisSegment)
            pickledTransitionData = self.__createPickledTransitionParameters(hostSegments, mirroringModePerSegment, None, dbIdToPeerMap)

        #
        # This will call sbin/gpsegstart.py
        #
        # the command is sent to the address of the host's first segment
        cmd = gp.GpSegStartCmd("remote segment starts on host '%s'" % hostName,
                               self.__gpHome, hostSegments,
                               self.__gpVersion,
                               mirroringModePreTransition,
                               numContentsInCluster,
                               era,
                               self.master_checksum_value,
                               self.__timeout,
                               verbose=logging_is_verbose(),
                               ctxt=base.REMOTE,
                               remoteHost=hostSegments[0].getSegmentAddress(),
                               pickledTransitionData=pickledTransitionData,
                               specialMode=self.__specialMode,
                               wrapper=self.__wrapper,
                               wrapper_args=self.__wrapper_args,
                               parallel=self.__parallel,
                               logfileDirectory=self.logfileDirectory)
        self.__workerPool.addCommand(cmd)

    if self.__quiet:
        self.__workerPool.join()
    else:
        base.join_and_indicate_progress(self.__workerPool)

    # process results
    self.__processStartOrConvertCommands(resultOut)
    self.__workerPool.empty_completed_items()
def __ensureStopped(self, gpEnv, directives):
    """
    Issue a fast-mode stop for the segments named by the directives, limited
    to those that _get_running_postgres_segments() returns.

    @param gpEnv supplies the GPHOME and version handed to the remote stop command
    @param directives a list of the GpStopSegmentDirectoryDirective values indicating which segments to stop
    """
    if len(directives) == 0:
        return

    logger.info("Ensuring %d failed segment(s) are stopped" % (len(directives)))
    segments = [d.getSegment() for d in directives]
    segments = self._get_running_postgres_segments(segments)
    segmentByHost = GpArray.getSegmentsByHostName(segments)

    cmds = []
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, hostSegments in segmentByHost.items():
        cmd = gp.GpSegStopCmd("remote segment stop on host '%s'" % hostName,
                              gpEnv.getGpHome(), gpEnv.getGpVersion(),
                              mode='fast', dbs=hostSegments, verbose=logging_is_verbose(),
                              ctxt=base.REMOTE, remoteHost=hostName)

        cmds.append(cmd)

    # we suppress checking for the error.  This is because gpsegstop will actually error
    #  in many cases where the stop is actually done (that is, for example, the segment is
    #  running but slow to shutdown so gpsegstop errors after whacking it with a kill)
    #
    # Perhaps we should make it so that it checks if the seg is running and only attempts the stop
    #  if it's running?  In that case, we could propagate the error
    #
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "stopping segments", suppressErrorCheck=True)
def __runStartCommand(self, segments, startMethod, numContentsInCluster, resultOut, gpArray, era):
    """
    Start the given segments with one remote command per host, wait for the
    worker pool to finish, then record the outcome in resultOut.
    """
    if len(segments) == 0:
        return

    if startMethod == START_AS_PRIMARY_OR_MIRROR:
        startup_msg = "Commencing parallel primary and mirror segment instance startup, please wait..."
    else:
        startup_msg = "Commencing parallel segment instance startup, please wait..."
    logger.info(startup_msg)

    peer_map = gpArray.getDbIdToPeerMap()

    if startMethod == START_AS_MIRRORLESS:
        pre_transition_mode = MIRROR_MODE_MIRRORLESS
    else:
        pre_transition_mode = MIRROR_MODE_QUIESCENT

    # dispatch one start command per host
    segs_by_host = GpArray.getSegmentsByHostName(segments)
    for host, host_segs in segs_by_host.items():
        logger.debug("Dispatching command to start segments on host: %s, " \
                     "with %s contents in cluster" % (host, numContentsInCluster))

        # transition data is only sent when starting as primary/mirror
        transition_data = None
        if startMethod == START_AS_PRIMARY_OR_MIRROR:
            per_seg_modes = [
                MIRROR_MODE_PRIMARY if seg.isSegmentPrimary(True) else MIRROR_MODE_MIRROR
                for seg in host_segs
            ]
            transition_data = self.__createPickledTransitionParameters(host_segs, per_seg_modes, None, peer_map)

        # per the original note, this ends up invoking sbin/gpsegstart.py;
        # the command is sent to the address of the host's first segment
        start_cmd = gp.GpSegStartCmd("remote segment starts on host '%s'" % host,
                                     self.__gpHome, host_segs,
                                     self.__gpVersion,
                                     pre_transition_mode,
                                     numContentsInCluster,
                                     era,
                                     self.master_checksum_value,
                                     self.__timeout,
                                     verbose=logging_is_verbose(),
                                     ctxt=base.REMOTE,
                                     remoteHost=host_segs[0].getSegmentAddress(),
                                     pickledTransitionData=transition_data,
                                     specialMode=self.__specialMode,
                                     wrapper=self.__wrapper,
                                     wrapper_args=self.__wrapper_args,
                                     parallel=self.__parallel,
                                     logfileDirectory=self.logfileDirectory)
        self.__workerPool.addCommand(start_cmd)

    # wait for completion, showing progress unless quiet
    if not self.__quiet:
        base.join_and_indicate_progress(self.__workerPool)
    else:
        self.__workerPool.join()

    # collect the results and reset the pool
    self.__processStartOrConvertCommands(resultOut)
    self.__workerPool.empty_completed_items()
def checkForPortAndDirectoryConflicts(self, gpArray):
    """
    Check gpArray for internal consistency -- no duplicate ports or directories on the same host, for example

    For segments where isSegmentQE() is true, the replication port must be set, must not
    collide with a port already used on the host, and must differ from the segment's own port.

    A detected problem causes an Exception to be raised
    """
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, segmentArr in GpArray.getSegmentsByHostName(gpArray.getDbList()).items():
        # maps of port / directory -> dbid of the segment that first used it on this host
        usedPorts = {}
        usedDataDirectories = {}
        for segment in segmentArr:

            # check for port conflict
            replicationPort = segment.getSegmentReplicationPort()
            port = segment.getSegmentPort()
            dbid = segment.getSegmentDbId()
            if port in usedPorts:
                raise Exception(
                    "On host %s, port %s for segment with dbid %s conflicts with port for segment dbid %s"
                    % (hostName, port, dbid, usedPorts.get(port)))

            if segment.isSegmentQE():
                if replicationPort is None:
                    raise Exception(
                        "On host %s, the replication port is not set for segment with dbid %s"
                        % (hostName, dbid))

                if replicationPort in usedPorts:
                    # argument order follows the message: host, replication port, dbid, conflicting dbid
                    raise Exception(
                        "On host %s, replication port %s for segment with dbid %s conflicts "
                        "with a port for segment dbid %s"
                        % (hostName, replicationPort, dbid, usedPorts.get(replicationPort)))

                if port == replicationPort:
                    raise Exception(
                        "On host %s, segment with dbid %s has equal port and replication port"
                        % (hostName, dbid))

            usedPorts[port] = dbid
            usedPorts[replicationPort] = dbid

            # check for directory conflict; could improve this by reporting nicer the conflicts
            paths = [path for oid, path in segment.getSegmentFilespaces().items()
                     if oid != gparray.SYSTEM_FILESPACE]
            paths.append(segment.getSegmentDataDirectory())

            for path in paths:
                if path in usedDataDirectories:
                    raise Exception(
                        "On host %s, directory (base or filespace) for segment with dbid %s conflicts with a "
                        "directory (base or filespace) for segment dbid %s; directory: %s"
                        % (hostName, dbid, usedDataDirectories.get(path), path))
                usedDataDirectories[path] = dbid
def checkForPortAndDirectoryConflicts(self, gpArray):
    """
    Check gpArray for internal consistency -- no duplicate ports or directories on the same host, for example

    In this variant a segment where isSegmentQE() is true must NOT have a replication port set.

    A detected problem causes an Exception to be raised
    """
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, segmentArr in GpArray.getSegmentsByHostName(gpArray.getDbList()).items():
        # maps of port / directory -> dbid of the segment that first used it on this host
        usedPorts = {}
        usedDataDirectories = {}
        for segment in segmentArr:

            # check for port conflict
            replicationPort = segment.getSegmentReplicationPort()
            port = segment.getSegmentPort()
            dbid = segment.getSegmentDbId()
            if port in usedPorts:
                raise Exception(
                    "On host %s, a port for segment with dbid %s conflicts with a port for segment dbid %s"
                    % (hostName, dbid, usedPorts.get(port)))

            if segment.isSegmentQE():
                if replicationPort is not None:
                    raise Exception(
                        "On host %s, the replication port is set for segment with dbid %s"
                        % (hostName, dbid))

                # past this point replicationPort is None for this segment
                if replicationPort in usedPorts:
                    raise Exception(
                        "On host %s, a port for segment with dbid %s conflicts with a port for segment dbid %s"
                        % (hostName, dbid, usedPorts.get(replicationPort)))

                if port == replicationPort:
                    raise Exception(
                        "On host %s, segment with dbid %s has equal port and replication port"
                        % (hostName, dbid))

            usedPorts[port] = dbid
            if replicationPort is not None:
                usedPorts[replicationPort] = dbid

            # check for directory conflict; could improve this by reporting nicer the conflicts
            paths = [path for oid, path in segment.getSegmentFilespaces().items()
                     if oid != gparray.SYSTEM_FILESPACE]
            paths.append(segment.getSegmentDataDirectory())

            for path in paths:
                # NOTE(review): the trailing "and 0" makes this condition always false, so the
                # directory-conflict check never fires. Left as is because enabling it would
                # change behaviour -- confirm whether the check was disabled on purpose.
                if path in usedDataDirectories and 0:
                    raise Exception(
                        "On host %s, directory (base or filespace) for segment with dbid %s conflicts with a "
                        "directory (base or filespace) for segment dbid %s; directory: %s"
                        % (hostName, dbid, usedDataDirectories.get(path), path))
                usedDataDirectories[path] = dbid
def __sendPrimaryMirrorTransition(self, targetMode, segments, convertUsingFullResync, gpArray, resultOut):
    """
    Dispatch one remote mirror-mode conversion command per host, wait for the worker
    pool, and record the outcome in resultOut.

    @param targetMode the mode every given segment is converted to
    @param segments the segments to convert
    @param convertUsingFullResync in parallel with segments, may be None, gives true/false for whether fullResync
                                  flag should be passed to the transition
    @param gpArray used to obtain the dbid-to-peer map
    @param resultOut receives the results via __processStartOrConvertCommands
    """

    if len(segments) == 0:
        logger.debug("%s conversion of zero segments...skipping" % targetMode)
        return

    logger.info("Commencing parallel %s conversion of %s segments, please wait..." % (targetMode, len(segments)))

    ###############################################
    # for each host, create + transfer the transition arguments file
    dispatchCount = 0

    dbIdToPeerMap = gpArray.getDbIdToPeerMap()
    segmentsByHostName = GpArray.getSegmentsByHostName(segments)
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, hostSegments in segmentsByHostName.items():
        assert len(hostSegments) > 0

        logger.debug("Dispatching command to convert segments on host: %s " % (hostName))

        # every segment on the host gets the same target mode
        targetModePerSegment = [targetMode for _ in hostSegments]
        pickledParams = self.__createPickledTransitionParameters(hostSegments, targetModePerSegment,
                                                                 convertUsingFullResync, dbIdToPeerMap)

        # the command is sent to the address of the host's first segment
        address = hostSegments[0].getSegmentAddress()
        cmd = gp.GpSegChangeMirrorModeCmd(
            "remote segment mirror mode conversion on host '%s' using address '%s'" % (hostName, address),
            self.__gpHome, self.__localeData, self.__gpVersion,
            hostSegments, targetMode, pickledParams, verbose=logging_is_verbose(),
            ctxt=base.REMOTE,
            remoteHost=address)
        self.__workerPool.addCommand(cmd)
        dispatchCount += 1
    self.__workerPool.wait_and_printdots(dispatchCount, self.__quiet)

    # process results
    self.__processStartOrConvertCommands(resultOut)
    self.__workerPool.empty_completed_items()
def __cleanUpSegmentDirectories(self, directives):
    """
    Run one remote GpCleanSegmentDirectories command per host for the segments
    named by the directives.

    @param directives objects whose getSegment() gives the segment to clean; nothing is done when empty
    """
    if len(directives) == 0:
        return

    self.__logger.info("Cleaning files from %d segment(s)" % (len(directives)))
    segments = [d.getSegment() for d in directives]
    segmentByHost = GpArray.getSegmentsByHostName(segments)

    cmds = []
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, hostSegments in segmentByHost.items():
        cmds.append(gp.GpCleanSegmentDirectories("clean segment directories on %s" % hostName,
                                                 hostSegments, gp.REMOTE, hostName))

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "cleaning existing directories")
def __cleanUpSegmentDirectories(self, directives):
    """
    Run one remote GpCleanSegmentDirectories command per host for the segments
    named by the directives.

    @param directives objects whose getSegment() gives the segment to clean; nothing is done when empty
    """
    if len(directives) == 0:
        return

    logger.info("Cleaning files from %d segment(s)" % (len(directives)))
    segments = [d.getSegment() for d in directives]
    segmentByHost = GpArray.getSegmentsByHostName(segments)

    cmds = []
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, hostSegments in segmentByHost.items():
        cmds.append(gp.GpCleanSegmentDirectories("clean segment directories on %s" % hostName,
                                                 hostSegments, gp.REMOTE, hostName))

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "cleaning existing directories")
def __init__(self, gpArray):
    """
    Record the lowest port among the array's QE segments and, per host name,
    the ports already taken by any segment on that host.

    Raises an Exception when the array has no QE segment ports.
    """
    #
    # port bookkeeping used when recovering to a new host: which ports are
    # taken, and where the valid port range starts
    #
    all_segs = gpArray.getDbList()
    qe_ports = [seg.getSegmentPort() for seg in all_segs if seg.isSegmentQE()]
    if len(qe_ports) == 0:
        raise Exception("No segment ports found in array.")
    self.__minPort = min(qe_ports)

    self.__usedPortsByHostName = {}
    for host, host_segs in GpArray.getSegmentsByHostName(all_segs).items():
        reserved = {}
        self.__usedPortsByHostName[host] = reserved
        for seg in host_segs:
            reserved[seg.getSegmentPort()] = True
def __sendPrimaryMirrorTransition(self, targetMode, segments, convertUsingFullResync, gpArray, resultOut):
    """
    Dispatch one remote mirror-mode conversion command per host, wait for the worker
    pool, and record the outcome in resultOut.

    @param targetMode the mode every given segment is converted to
    @param segments the segments to convert
    @param convertUsingFullResync in parallel with segments, may be None, gives true/false for whether fullResync
                                  flag should be passed to the transition
    @param gpArray used to obtain the dbid-to-peer map
    @param resultOut receives the results via __processStartOrConvertCommands
    """

    if len(segments) == 0:
        logger.debug("%s conversion of zero segments...skipping" % targetMode)
        return

    logger.info("Commencing parallel %s conversion of %s segments, please wait..." % (targetMode, len(segments)))

    ###############################################
    # for each host, create + transfer the transition arguments file
    dispatchCount = 0

    dbIdToPeerMap = gpArray.getDbIdToPeerMap()
    segmentsByHostName = GpArray.getSegmentsByHostName(segments)
    # items() is available on both Python 2 and Python 3; iteritems() is Python 2 only
    for hostName, hostSegments in segmentsByHostName.items():
        assert len(hostSegments) > 0

        logger.debug("Dispatching command to convert segments on host: %s " % (hostName))

        # every segment on the host gets the same target mode
        targetModePerSegment = [targetMode for _ in hostSegments]
        pickledParams = self.__createPickledTransitionParameters(hostSegments, targetModePerSegment,
                                                                 convertUsingFullResync, dbIdToPeerMap)

        # the command is sent to the address of the host's first segment
        address = hostSegments[0].getSegmentAddress()
        cmd = gp.GpSegChangeMirrorModeCmd(
            "remote segment mirror mode conversion on host '%s' using address '%s'" % (hostName, address),
            self.__gpHome, self.__localeData, self.__gpVersion,
            hostSegments, targetMode, pickledParams, verbose=logging_is_verbose(),
            ctxt=base.REMOTE,
            remoteHost=address)
        self.__workerPool.addCommand(cmd)
        dispatchCount += 1
    self.__workerPool.wait_and_printdots(dispatchCount, self.__quiet)

    # process results
    self.__processStartOrConvertCommands(resultOut)
    self.__workerPool.empty_completed_items()
def __updateGpIdFile(self, gpEnv, gpArray, segments):
    """
    Run one ConfigureNewSegment command per host, with writeGpIdFileOnly=True,
    for the given segments.

    gpEnv and gpArray are not used by this method.
    """
    hosts = GpArray.getSegmentsByHostName(segments)
    info_by_host = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(segments)

    gpid_cmds = []
    for host in hosts:
        host_info = info_by_host[host]
        checkNotNone("segmentInfo for %s" % host, host_info)
        gpid_cmds.append(
            gp.ConfigureNewSegment("update gpid file",
                                   host_info,
                                   newSegments=False,
                                   verbose=gplog.logging_is_verbose(),
                                   batchSize=self.__parallelDegree,
                                   ctxt=gp.REMOTE,
                                   remoteHost=host,
                                   validationOnly=False,
                                   writeGpIdFileOnly=True))

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(gpid_cmds, "writing updated gpid files")
def __updateGpIdFile(self, gpEnv, gpArray, segments):
    """
    For every host that has one of the given segments, queue a
    ConfigureNewSegment command in gpid-file-only mode, then run them all
    through the worker pool.

    gpEnv and gpArray are accepted but not read here.
    """
    segs_by_host = GpArray.getSegmentsByHostName(segments)
    seg_info_by_host = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(segments)

    def build_cmd(host):
        # the segment info for the host must exist before the command is built
        info = seg_info_by_host[host]
        checkNotNone("segmentInfo for %s" % host, info)
        return gp.ConfigureNewSegment("update gpid file",
                                      info,
                                      newSegments=False,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=host,
                                      validationOnly=False,
                                      writeGpIdFileOnly=True)

    commands = [build_cmd(host) for host in segs_by_host.keys()]
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(commands, "writing updated gpid files")
def __ensureSharedMemCleaned(self, gpEnv, directives):
    """
    Clean shared memory, host by host, for the segments the directives name.
    Hosts whose operation fails are reported with a warning only.

    @param gpEnv not read by this method
    @param directives a list of the GpStopSegmentDirectoryDirective values indicating which segments to cleanup
    """
    if len(directives) == 0:
        return

    logger.info('Ensuring that shared memory is cleaned up for stopped segments')

    segs_by_host = GpArray.getSegmentsByHostName([d.getSegment() for d in directives])

    # build one remote CleanSharedMem operation per host, then run them in parallel
    cleanup_ops = []
    for host_name, host_segs in segs_by_host.items():
        cleanup_ops.append(RemoteOperation(CleanSharedMem(host_segs), host=host_name))
    ParallelOperation(cleanup_ops).run()

    # get_ret() raises when the operation failed; that is downgraded to a warning
    for op in cleanup_ops:
        try:
            op.get_ret()
        except Exception:
            logger.warning('Unable to clean up shared memory for stopped segments on host (%s)' % op.host)
def getTriplets(self):
    """
    Pair each host that has down segments with one of self.newHosts and build
    a recovery request for every down segment on it.

    Surplus new hosts are noted in self.interfaceHostnameWarnings. Raises an
    Exception when there are fewer new hosts than hosts with down segments,
    and ExceptionNoStackTraceNeeded when any needed new host is unreachable.

    Returns the result of _convert_requests_to_triplets() on the requests.
    """
    down_segs = [seg for seg in self.gpArray.getSegDbList() if seg.isSegmentDown()]
    failedSegments = GpArray.getSegmentsByHostName(down_segs)
    hosts_needed = len(failedSegments)

    # validate the supplied recovery hosts against the number of failed hosts
    if len(self.newHosts) > hosts_needed:
        self.interfaceHostnameWarnings.append("The following recovery hosts were not needed:")
        for extra_host in self.newHosts[hosts_needed:]:
            self.interfaceHostnameWarnings.append("\t%s" % extra_host)

    if len(self.newHosts) < hosts_needed:
        raise Exception('Not enough new recovery hosts given for recovery.')

    unreachable = get_unreachable_segment_hosts(self.newHosts[:hosts_needed], hosts_needed)
    if unreachable:
        raise ExceptionNoStackTraceNeeded("Cannot recover. The following recovery target hosts are "
                                          "unreachable: %s" % unreachable)

    # failed hosts are taken in sorted order and paired positionally with newHosts
    requests = []
    for failed_host, target_host in zip(sorted(failedSegments), self.newHosts):
        for failed_seg in failedSegments[failed_host]:
            target_port = self.portAssigner.findAndReservePort(target_host, target_host)
            requests.append(
                RecoveryTripletRequest(failed_seg, target_host, target_port,
                                       failed_seg.getSegmentDataDirectory(), True))

    return self._convert_requests_to_triplets(requests)
def _sortedSegs(gparray):
    """
    Group the array's segment dbs by host name, with each host's list sorted
    by ascending dbid.
    """
    def dbid_of(seg):
        return seg.getSegmentDbId()

    grouped = GpArray.getSegmentsByHostName(gparray.getSegDbList())
    # snapshot the keys so the values can be rebound while looping
    for host in list(grouped):
        grouped[host] = sorted(grouped[host], key=dbid_of)
    return grouped
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    Validate the target directories, configure the destination segments with
    ConfigureNewSegment, remove the progress log files and copy any db_dumps
    directory from each source segment to its destination.

    directives should be composed of GpCopySegmentDirectoryDirective values

    gpEnv and gpArray are not read by this method.
    Raises ExceptionNoStackTraceNeeded listing the validation failures, if any.
    """
    if len(directives) == 0:
        return

    srcSegments = []
    destSegments = []
    isTargetReusedLocation = []
    # one timestamp per run, shared by all progress file names
    timeStamp = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
    for directive in directives:
        srcSegment = directive.getSrcSegment()
        destSegment = directive.getDestSegment()
        # annotate the destination with its source's host/port and a per-dbid progress file
        destSegment.primaryHostname = srcSegment.getSegmentHostName()
        destSegment.primarySegmentPort = srcSegment.getSegmentPort()
        destSegment.progressFile = '%s/pg_basebackup.%s.dbid%s.out' % (gplog.get_logger_dir(),
                                                                        timeStamp,
                                                                        destSegment.getSegmentDbId())
        srcSegments.append(srcSegment)
        destSegments.append(destSegment)
        isTargetReusedLocation.append(directive.isTargetReusedLocation())

    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(destSegments, isTargetReusedLocation)

    # builds the per-host ConfigureNewSegment command used for both validation and configuration
    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)

        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      gplog.get_logger_dir(),
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly,
                                      forceoverwrite=self.__forceoverwrite)

    #
    # validate directories for target segments
    #
    self.__logger.info('Validating remote directories')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(createConfigureNewSegmentCommand(hostName, 'validate blank segments', True))
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    # collect every validation failure before raising, so all hosts are reported at once
    validationErrors = []
    for item in self.__pool.getCompletedItems():
        results = item.get_results()
        if not results.wasSuccessful():
            if results.rc == 1:
                # stdoutFromFailure = results.stdout.replace("\n", " ").strip()
                lines = results.stderr.split("\n")
                for line in lines:
                    if len(line.strip()) > 0:
                        validationErrors.append("Validation failure on host %s %s" % (item.remoteHost, line))
            else:
                # any other return code: report the whole command item
                validationErrors.append(str(item))
    self.__pool.empty_completed_items()
    if validationErrors:
        raise ExceptionNoStackTraceNeeded("\n" + ("\n".join(validationErrors)))

    # Configure a new segment
    #
    # Recover segments using gpconfigurenewsegment, which
    # uses pg_basebackup. gprecoverseg generates a log filename which is
    # passed to gpconfigurenewsegment as a confinfo parameter. gprecoverseg
    # tails this file to show recovery progress to the user, and removes the
    # file when done. A new file is generated for each run of
    # gprecoverseg based on a timestamp.
    #
    # There is a race between when the pg_basebackup log file is created and
    # when the progress command is run. Thus, the progress command touches
    # the file to ensure it is present before tailing.
    self.__logger.info('Configuring new segments')
    cmds = []
    progressCmds = []
    removeCmds= []
    for hostName in destSegmentByHost.keys():
        for segment in destSegmentByHost[hostName]:
            # progress commands are only built when progress reporting is enabled
            if self.__progressMode != GpMirrorListToBuild.Progress.NONE:
                progressCmds.append(
                    GpMirrorListToBuild.ProgressCommand("tail the last line of the file",
                                   "set -o pipefail; touch -a {0}; tail -1 {0} | tr '\\r' '\\n' | tail -1".format(
                                       pipes.quote(segment.progressFile)),
                                   segment.getSegmentDbId(),
                                   segment.progressFile,
                                   ctxt=base.REMOTE,
                                   remoteHost=hostName))
            removeCmds.append(
                base.Command("remove file",
                             "rm -f %s" % pipes.quote(segment.progressFile),
                             ctxt=base.REMOTE,
                             remoteHost=hostName))

        cmds.append(
            createConfigureNewSegmentCommand(hostName, 'configure blank segments', False))

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "unpacking basic segment directory",
                                                      suppressErrorCheck=False,
                                                      progressCmds=progressCmds)

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(removeCmds, "removing pg_basebackup progress logfiles",
                                                      suppressErrorCheck=False)

    #
    # copy dump files from old segment to new segment
    #
    # source and destination are matched by content id
    for srcSeg in srcSegments:
        for destSeg in destSegments:
            if srcSeg.content == destSeg.content:
                src_dump_dir = os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps')
                # NOTE(review): the existence check for the source directory runs on the
                # destination segment's address -- confirm this is intended
                cmd = base.Command('check existence of db_dumps directory', 'ls %s' % (src_dump_dir),
                                   ctxt=base.REMOTE, remoteHost=destSeg.getSegmentAddress())
                cmd.run()
                if cmd.results.rc == 0:  # Only try to copy directory if it exists
                    cmd = Scp('copy db_dumps from old segment to new segment',
                              os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps*', '*'),
                              os.path.join(destSeg.getSegmentDataDirectory(), 'db_dumps'),
                              srcSeg.getSegmentAddress(),
                              destSeg.getSegmentAddress(),
                              recursive=True)
                    cmd.run(validateAfter=True)
                    break
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    Build a template tar file, validate the target directories, copy the tar to
    every target host, configure the destination segments from it, copy db_dumps
    files from each source segment to its destination, and remove the tar file
    and the local temp directory.

    directives should be composed of GpCopySegmentDirectoryDirective values

    Raises ExceptionNoStackTraceNeeded listing the validation failures, if any.
    """
    if len(directives) == 0:
        return

    srcSegments = [d.getSrcSegment() for d in directives]
    destSegments = [d.getDestSegment() for d in directives]
    isTargetReusedLocation = [d.isTargetReusedLocation() for d in directives]
    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(destSegments, isTargetReusedLocation)

    logger.info('Building template directory')
    # the template is built from the first source segment
    (tempDir, blankTarFile, tarFileName) = self.__buildTarFileForTransfer(gpEnv, gpArray.master, srcSegments[0],
                                                                          destSegments)

    # builds the per-host ConfigureNewSegment command used for both validation and configuration
    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)
        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      tarFile=tarFileName,
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly)

    #
    # validate directories for target segments
    #
    logger.info('Validating remote directories')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(createConfigureNewSegmentCommand(hostName, 'validate blank segments', True))
    for cmd in cmds:
        self.__pool.addCommand(cmd)
    self.__pool.wait_and_printdots(len(cmds), self.__quiet)

    # collect every validation failure before raising, so all hosts are reported at once
    validationErrors = []
    for item in self.__pool.getCompletedItems():
        results = item.get_results()
        if not results.wasSuccessful():
            if results.rc == 1:
                # stdoutFromFailure = results.stdout.replace("\n", " ").strip()
                lines = results.stderr.split("\n")
                for line in lines:
                    if len(line.strip()) > 0:
                        validationErrors.append("Validation failure on host %s %s" % (item.remoteHost, line))
            else:
                # any other return code: report the whole command item
                validationErrors.append(str(item))
    self.__pool.empty_completed_items()
    if validationErrors:
        # NOTE(review): the local temp directory and tar file built above are not removed on this path
        raise ExceptionNoStackTraceNeeded("\n" + ("\n".join(validationErrors)))

    #
    # copy tar from master to target hosts
    #
    logger.info('Copying template directory file')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(gp.RemoteCopy("copy segment tar", blankTarFile, hostName, tarFileName))

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "building and transferring basic segment directory")

    #
    # unpack and configure new segments
    #
    logger.info('Configuring new segments')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(createConfigureNewSegmentCommand(hostName, 'configure blank segments', False))
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "unpacking basic segment directory")

    #
    # copy dump files from old segment to new segment
    #
    # source and destination are matched by content id; only the first match is copied
    for srcSeg in srcSegments:
        for destSeg in destSegments:
            if srcSeg.content == destSeg.content:
                cmd = Scp('copy db_dumps from old segment to new segment',
                          os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps*', '*'),
                          os.path.join(destSeg.getSegmentDataDirectory(), 'db_dumps'),
                          srcSeg.getSegmentAddress(),
                          destSeg.getSegmentAddress(),
                          recursive=True)
                cmd.run(validateAfter=True)
                break

    #
    # Clean up copied tar from each remote host
    #
    logger.info('Cleaning files')
    cmds = []
    # only the host names are needed here; keys() works on both Python 2 and Python 3,
    # whereas iteritems() is Python 2 only
    for hostName in destSegmentByHost.keys():
        cmds.append(unix.RemoveFiles('remove tar file', tarFileName, ctxt=gp.REMOTE, remoteHost=hostName))
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "cleaning up tar file on segment hosts")

    #
    # clean up the local temp directory
    #
    unix.RemoveFiles.local('remove temp directory', tempDir)
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    Validate the target directories, configure the new segments through
    gp.ConfigureNewSegment (with a per-segment pg_basebackup progress file)
    and copy db_dumps files from each source segment to its matching
    destination segment.

    directives should be composed of GpCopySegmentDirectoryDirective values

    Each destination segment is annotated with primaryHostname,
    primarySegmentPort and progressFile before the commands are built.

    Returns None. Raises ExceptionNoStackTraceNeeded when validation of the
    remote target directories reports failures.
    """
    if len(directives) == 0:
        return

    srcSegments = []
    destSegments = []
    isTargetReusedLocation = []
    # one timestamp per invocation, shared by all progress file names
    timeStamp = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
    for directive in directives:
        srcSegment = directive.getSrcSegment()
        destSegment = directive.getDestSegment()
        destSegment.primaryHostname = srcSegment.getSegmentHostName()
        destSegment.primarySegmentPort = srcSegment.getSegmentPort()
        destSegment.progressFile = '%s/pg_basebackup.%s.dbid%s.out' % (
            gplog.get_logger_dir(), timeStamp, destSegment.getSegmentDbId())
        srcSegments.append(srcSegment)
        destSegments.append(destSegment)
        isTargetReusedLocation.append(directive.isTargetReusedLocation())

    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(
        destSegments, isTargetReusedLocation)

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)
        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      gplog.get_logger_dir(),
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly,
                                      forceoverwrite=self.__forceoverwrite)

    #
    # validate directories for target segments
    #
    self.__logger.info('Validating remote directories')
    cmds = []
    for hostName in list(destSegmentByHost.keys()):
        cmds.append(
            createConfigureNewSegmentCommand(hostName,
                                             'validate blank segments', True))
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    validationErrors = []
    for item in self.__pool.getCompletedItems():
        results = item.get_results()
        if not results.wasSuccessful():
            if results.rc == 1:
                # rc 1: every non-blank stderr line is reported as its own
                # validation failure for that host
                lines = results.stderr.split("\n")
                for line in lines:
                    if len(line.strip()) > 0:
                        validationErrors.append(
                            "Validation failure on host %s %s" %
                            (item.remoteHost, line))
            else:
                # any other failure: report the whole command item
                validationErrors.append(str(item))
    self.__pool.empty_completed_items()
    if validationErrors:
        raise ExceptionNoStackTraceNeeded("\n" +
                                          ("\n".join(validationErrors)))

    # Configure a new segment
    #
    # Recover segments using gpconfigurenewsegment, which
    # uses pg_basebackup. gprecoverseg generates a log filename which is
    # passed to gpconfigurenewsegment as a confinfo parameter. gprecoverseg
    # tails this file to show recovery progress to the user, and removes the
    # file when done. A new file is generated for each run of
    # gprecoverseg based on a timestamp.
    self.__logger.info('Configuring new segments')
    cmds = []
    progressCmds = []
    removeCmds = []
    for hostName in list(destSegmentByHost.keys()):
        for segment in destSegmentByHost[hostName]:
            progressCmd, removeCmd = self.__getProgressAndRemoveCmds(
                segment.progressFile, segment.getSegmentDbId(), hostName)
            removeCmds.append(removeCmd)
            if progressCmd:
                progressCmds.append(progressCmd)

        # one configure command per host, covering all of its segments
        cmds.append(
            createConfigureNewSegmentCommand(hostName,
                                             'configure blank segments',
                                             False))

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(
        cmds,
        "unpacking basic segment directory",
        suppressErrorCheck=False,
        progressCmds=progressCmds)

    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(
        removeCmds,
        "removing pg_basebackup progress logfiles",
        suppressErrorCheck=False)

    #
    # copy dump files from old segment to new segment
    #
    for srcSeg in srcSegments:
        for destSeg in destSegments:
            if srcSeg.content != destSeg.content:
                continue
            src_dump_dir = os.path.join(srcSeg.getSegmentDataDirectory(),
                                        'db_dumps')
            # src_dump_dir is a path on the source segment's host (the same
            # host Scp reads from below), so the existence probe must run
            # there rather than on the destination host.
            cmd = base.Command('check existence of db_dumps directory',
                               'ls %s' % (src_dump_dir),
                               ctxt=base.REMOTE,
                               remoteHost=srcSeg.getSegmentAddress())
            cmd.run()
            if cmd.results.rc == 0:  # Only try to copy directory if it exists
                cmd = Scp('copy db_dumps from old segment to new segment',
                          os.path.join(srcSeg.getSegmentDataDirectory(),
                                       'db_dumps*', '*'),
                          os.path.join(destSeg.getSegmentDataDirectory(),
                                       'db_dumps'),
                          srcSeg.getSegmentAddress(),
                          destSeg.getSegmentAddress(),
                          recursive=True)
                cmd.run(validateAfter=True)
                break
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    Validate the target directories, configure the new segments through
    gp.ConfigureNewSegment and copy db_dumps files from each source segment
    to its matching destination segment.

    directives should be composed of GpCopySegmentDirectoryDirective values

    Each destination segment is annotated with primaryHostname and
    primarySegmentPort taken from its source segment.

    Returns None. Raises ExceptionNoStackTraceNeeded when validation of the
    remote target directories reports failures.
    """
    if len(directives) == 0:
        return

    srcSegments = []
    destSegments = []
    isTargetReusedLocation = []
    for directive in directives:
        srcSegment = directive.getSrcSegment()
        destSegment = directive.getDestSegment()
        destSegment.primaryHostname = srcSegment.getSegmentHostName()
        destSegment.primarySegmentPort = srcSegment.getSegmentPort()
        srcSegments.append(srcSegment)
        destSegments.append(destSegment)
        isTargetReusedLocation.append(directive.isTargetReusedLocation())

    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(destSegments, isTargetReusedLocation)

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)
        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      gplog.get_logger_dir(),
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly,
                                      forceoverwrite=self.__forceoverwrite)

    #
    # validate directories for target segments
    #
    self.__logger.info('Validating remote directories')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(createConfigureNewSegmentCommand(hostName, 'validate blank segments', True))
    for cmd in cmds:
        self.__pool.addCommand(cmd)

    if self.__quiet:
        self.__pool.join()
    else:
        base.join_and_indicate_progress(self.__pool)

    validationErrors = []
    for item in self.__pool.getCompletedItems():
        results = item.get_results()
        if not results.wasSuccessful():
            if results.rc == 1:
                # rc 1: every non-blank stderr line is reported as its own
                # validation failure for that host
                lines = results.stderr.split("\n")
                for line in lines:
                    if len(line.strip()) > 0:
                        validationErrors.append("Validation failure on host %s %s" % (item.remoteHost, line))
            else:
                # any other failure: report the whole command item
                validationErrors.append(str(item))
    self.__pool.empty_completed_items()
    if validationErrors:
        raise ExceptionNoStackTraceNeeded("\n" + ("\n".join(validationErrors)))

    #
    # unpack and configure new segments
    #
    self.__logger.info('Configuring new segments')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(createConfigureNewSegmentCommand(hostName, 'configure blank segments', False))
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "unpacking basic segment directory")

    #
    # copy dump files from old segment to new segment
    #
    for srcSeg in srcSegments:
        for destSeg in destSegments:
            if srcSeg.content != destSeg.content:
                continue
            src_dump_dir = os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps')
            # src_dump_dir is a path on the source segment's host (the same
            # host Scp reads from below), so the existence probe must run
            # there rather than on the destination host.
            cmd = base.Command('check existence of db_dumps directory',
                               'ls %s' % (src_dump_dir),
                               ctxt=base.REMOTE,
                               remoteHost=srcSeg.getSegmentAddress())
            cmd.run()
            if cmd.results.rc == 0:  # Only try to copy directory if it exists
                cmd = Scp('copy db_dumps from old segment to new segment',
                          os.path.join(srcSeg.getSegmentDataDirectory(), 'db_dumps*', '*'),
                          os.path.join(destSeg.getSegmentDataDirectory(), 'db_dumps'),
                          srcSeg.getSegmentAddress(),
                          destSeg.getSegmentAddress(),
                          recursive=True)
                cmd.run(validateAfter=True)
                break
def rebalance(self):
    """
    Stop the unbalanced primary segments, issue a distributed query so the
    resulting fault is detected, then run gprecoverseg to resynchronize.

    SIGINT is ignored while the operation runs; on exit (normal or by
    exception) the worker pool is shut down and the default SIGINT handler
    is reinstalled. Segments that fail to stop are only warned about and
    the operation continues as a partial rebalance.
    """
    # Get the unbalanced primary segments grouped by hostname
    # These segments are what we will shutdown.
    logger.info("Getting unbalanced segments")
    unbalanced_primary_segs = GpArray.getSegmentsByHostName(
        self.gpArray.get_unbalanced_primary_segdbs())
    pool = WorkerPool()

    count = 0

    try:
        # Disable ctrl-c
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        logger.info("Stopping unbalanced primary segments...")
        for hostname in unbalanced_primary_segs.keys():
            cmd = GpSegStopCmd("stop unbalanced primary segs",
                               self.gpEnv.getGpHome(),
                               self.gpEnv.getGpVersion(),
                               'fast',
                               unbalanced_primary_segs[hostname],
                               ctxt=REMOTE,
                               remoteHost=hostname,
                               timeout=600)
            pool.addCommand(cmd)
            count += 1

        pool.wait_and_printdots(count, False)

        failed_count = 0
        completed = pool.getCompletedItems()
        for res in completed:
            if not res.get_results().wasSuccessful():
                failed_count += 1

        if failed_count > 0:
            # the count has to be interpolated, otherwise a literal "%d"
            # ends up in the log
            logger.warn(
                "%d segments failed to stop. A full rebalance of the" %
                failed_count)
            logger.warn(
                "system is not possible at this time. Please check the")
            logger.warn(
                "log files, correct the problem, and run gprecoverseg -r")
            logger.warn("again.")
            logger.info(
                "gprecoverseg will continue with a partial rebalance.")

        pool.empty_completed_items()

        # issue a distributed query to make sure we pick up the fault
        # that we just caused by shutting down segments
        conn = None
        try:
            logger.info("Triggering segment reconfiguration")
            dburl = dbconn.DbURL()
            conn = dbconn.connect(dburl)
            cmd = ReconfigDetectionSQLQueryCommand(conn)
            pool.addCommand(cmd)
            pool.wait_and_printdots(1, False)
        except Exception:
            # This exception is expected
            pass
        finally:
            if conn:
                conn.close()

        # Final step is to issue a recoverseg operation to resync segments
        logger.info("Starting segment synchronization")
        cmd = GpRecoverseg("rebalance recoverseg")
        pool.addCommand(cmd)
        pool.wait_and_printdots(1, False)
    finally:
        # Errors propagate unchanged; always stop the pool's workers and
        # give Ctrl-C back to the user.
        pool.join()
        pool.haltWork()
        pool.joinWorkers()
        signal.signal(signal.SIGINT, signal.default_int_handler)
def rebalance(self):
    """
    Rebalance segment pairs whose primary/mirror roles are not balanced.

    Only pairs that are up, reachable and synchronized are rebalanced;
    other unbalanced pairs are logged and skipped. The selected primaries
    are stopped, a segment reconfiguration is triggered and an in-process
    "gprecoverseg -a" run resynchronizes the segments.

    SIGINT is ignored while the operation runs; on exit the worker pool is
    shut down and the default SIGINT handler is reinstalled.

    Returns True when there was nothing to rebalance or when every selected
    primary stopped successfully (a full rebalance), False when some failed
    to stop (a partial rebalance).

    Raises Exception when the gprecoverseg step exits with a non-zero code.
    """
    self.logger.info("Determining primary and mirror segment pairs to rebalance")

    # The current implementation of rebalance calls "gprecoverseg -a" below.
    # Thus, if another balanced pair is not synchronized, or has a down mirror
    # that pair will be recovered as a side-effect of rebalancing.
    unbalanced_primary_segs = []
    for segmentPair in self.gpArray.segmentPairs:
        if segmentPair.balanced():
            continue

        if segmentPair.up() and segmentPair.reachable() and segmentPair.synchronized():
            unbalanced_primary_segs.append(segmentPair.primaryDB)
        else:
            self.logger.warning(
                "Not rebalancing primary segment dbid %d with its mirror dbid %d because one is either down, unreachable, or not synchronized"
                % (segmentPair.primaryDB.dbid, segmentPair.mirrorDB.dbid))

    if not len(unbalanced_primary_segs):
        self.logger.info("No segments to rebalance")
        return True

    unbalanced_primary_segs = GpArray.getSegmentsByHostName(unbalanced_primary_segs)

    # one stop command is issued per host, so never start more workers than hosts
    pool = base.WorkerPool(min(len(unbalanced_primary_segs), self.batch_size))
    try:
        # Disable ctrl-c
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        self.logger.info("Stopping unbalanced primary segments...")
        for hostname in list(unbalanced_primary_segs.keys()):
            cmd = GpSegStopCmd("stop unbalanced primary segs",
                               self.gpEnv.getGpHome(),
                               self.gpEnv.getGpVersion(),
                               'fast',
                               unbalanced_primary_segs[hostname],
                               ctxt=base.REMOTE,
                               remoteHost=hostname,
                               timeout=600,
                               segment_batch_size=self.segment_batch_size)
            pool.addCommand(cmd)

        base.join_and_indicate_progress(pool)

        failed_count = 0
        completed = pool.getCompletedItems()
        for res in completed:
            if not res.get_results().wasSuccessful():
                failed_count += 1

        allSegmentsStopped = (failed_count == 0)

        if not allSegmentsStopped:
            self.logger.warning("%d segments failed to stop. A full rebalance of the" % failed_count)
            self.logger.warning("system is not possible at this time. Please check the")
            self.logger.warning("log files, correct the problem, and run gprecoverseg -r")
            self.logger.warning("again.")
            self.logger.info("gprecoverseg will continue with a partial rebalance.")

        pool.empty_completed_items()
        segment_reconfigurer = SegmentReconfigurer(logger=self.logger,
                                                   worker_pool=pool,
                                                   timeout=MIRROR_PROMOTION_TIMEOUT)
        segment_reconfigurer.reconfigure()

        # Final step is to issue a recoverseg operation to resync segments
        self.logger.info("Starting segment synchronization")
        original_sys_args = sys.argv[:]
        self.logger.info("=============================START ANOTHER RECOVER=========================================")
        # import here because GpRecoverSegmentProgram and GpSegmentRebalanceOperation have a circular dependency
        from gppylib.programs.clsRecoverSegment import GpRecoverSegmentProgram
        cmd_args = ['gprecoverseg', '-a', '-B', str(self.batch_size), '-b', str(self.segment_batch_size)]

        # sys.argv is overwritten because the parser is invoked without
        # explicit arguments. Building the program happens inside the try
        # so that sys.argv is restored and the END banner is logged even
        # when option parsing or program construction fails.
        recover_cmd = None
        try:
            sys.argv = cmd_args[:]
            local_parser = GpRecoverSegmentProgram.createParser()
            local_options, args = local_parser.parse_args()
            recover_cmd = GpRecoverSegmentProgram.createProgram(local_options, args)
            recover_cmd.run()
        except SystemExit as e:
            # a zero exit code is a normal completion
            if e.code != 0:
                self.logger.error("Failed to start the synchronization step of the segment rebalance.")
                self.logger.error("Check the gprecoverseg log file, correct any problems, and re-run")
                self.logger.error(' '.join(cmd_args))
                raise Exception("Error synchronizing.\nError: %s" % str(e))
        finally:
            if recover_cmd is not None:
                recover_cmd.cleanup()
            sys.argv = original_sys_args
            self.logger.info("==============================END ANOTHER RECOVER==========================================")
    finally:
        # Errors propagate unchanged; always stop the pool's workers and
        # give Ctrl-C back to the user.
        pool.join()
        pool.haltWork()
        pool.joinWorkers()
        signal.signal(signal.SIGINT, signal.default_int_handler)

    return allSegmentsStopped  # if all segments stopped, then a full rebalance was done
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    Build a segment template from the first source segment, tar it, push
    the tar to every destination host, configure the new segments from it
    and remove the transferred tar and the local temp directory.

    directives should be composed of GpCopySegmentDirectoryDirective values

    Returns None. Raises ExceptionNoStackTraceNeeded when the array's fault
    strategy is not FAULT_STRATEGY_NONE (the only strategy a template is
    built for here) or when validation of the remote target directories
    reports failures.
    """
    if len(directives) == 0:
        return

    srcSegments = [d.getSrcSegment() for d in directives]
    destSegments = [d.getDestSegment() for d in directives]
    isTargetReusedLocation = [d.isTargetReusedLocation() for d in directives]
    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(destSegments, isTargetReusedLocation)

    logger.info("Building template directory")
    # In GPSQL, we need to create a template and copy it to all of failed segments.
    # The template/tar names below are only set up for this fault strategy;
    # fail with a clear message instead of a NameError later on.
    if gpArray.getFaultStrategy() != gparray.FAULT_STRATEGY_NONE:
        raise ExceptionNoStackTraceNeeded(
            "Cannot build segment template: unsupported fault strategy %s" % gpArray.getFaultStrategy()
        )

    tempDir = "/tmp/GPSQL"
    templateDir = tempDir + "/gpsql_template" + time.strftime("%Y%m%d_%H%M%S")
    tarFileName = "gpsqlSegmentTemplate.tar"
    blankTarFile = tempDir + "/" + tarFileName

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)
        return gp.ConfigureNewSegment(
            cmdLabel,
            segmentInfo,
            tarFile=tarFileName,
            newSegments=True,
            verbose=gplog.logging_is_verbose(),
            batchSize=self.__parallelDegree,
            ctxt=gp.REMOTE,
            remoteHost=hostName,
            validationOnly=validationOnly,
        )

    # The local temp directory is removed in the finally clause so that a
    # failure in any step does not leave the template and tar behind.
    try:
        unix.MakeDirectory("create blank directory for segment", templateDir).run(validateAfter=True)
        unix.Chmod.local("set permissions on template dir", templateDir, "0700")  # set perms so postgres can start

        logger.info("Creating template")
        srcSegments[0].createTemplate(templateDir)

        # Don't need log files and gpperfmon files in template, nor the
        # postmaster.opts / postmaster.pid files.
        unwantedInTemplate = [
            ("gprecoverseg remove gppermfon data from template", "/gpperfmon/data"),
            ("gprecoverseg remove logs from template", "/pg_log/*"),
            ("gprecoverseg remove postmaster.opt from template", "/postmaster.opts"),
            ("gprecoverseg remove postmaster.pid from template", "/postmaster.pid"),
        ]
        for rmLabel, relativePath in unwantedInTemplate:
            rmCmd = unix.RemoveFiles(rmLabel, templateDir + relativePath)
            rmCmd.run(validateAfter=True)

        # template the temporary directories file
        template_temporary_directories(templateDir, srcSegments[0].content)

        cmd = gp.CreateTar("gpbuildingmirrorsegment tar segment template", templateDir, blankTarFile)
        cmd.run(validateAfter=True)

        #
        # validate directories for target segments
        #
        logger.info("Validating remote directories")
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(createConfigureNewSegmentCommand(hostName, "validate blank segments", True))
        for cmd in cmds:
            self.__pool.addCommand(cmd)
        self.__pool.wait_and_printdots(len(cmds), self.__quiet)

        validationErrors = []
        for item in self.__pool.getCompletedItems():
            results = item.get_results()
            if not results.wasSuccessful():
                if results.rc == 1:
                    # rc 1: every non-blank stderr line is reported as its
                    # own validation failure for that host
                    lines = results.stderr.split("\n")
                    for line in lines:
                        if len(line.strip()) > 0:
                            validationErrors.append("Validation failure on host %s %s" % (item.remoteHost, line))
                else:
                    # any other failure: report the whole command item
                    validationErrors.append(str(item))
        self.__pool.empty_completed_items()
        if validationErrors:
            raise ExceptionNoStackTraceNeeded("\n" + ("\n".join(validationErrors)))

        #
        # copy tar from master to target hosts
        #
        logger.info("Copying template directory file")
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(gp.RemoteCopy("copy segment tar", blankTarFile, hostName, tarFileName))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "building and transferring basic segment directory")

        #
        # unpack and configure new segments
        #
        logger.info("Configuring new segments")
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(createConfigureNewSegmentCommand(hostName, "configure blank segments", False))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "unpacking basic segment directory")

        #
        # Clean up copied tar from each remote host
        #
        logger.info("Cleaning files")
        cmds = []
        # only the host names are needed; plain key iteration also avoids
        # the Python-2-only dict.iteritems()
        for hostName in destSegmentByHost.keys():
            cmds.append(unix.RemoveFiles("remove tar file", tarFileName, ctxt=gp.REMOTE, remoteHost=hostName))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "cleaning up tar file on segment hosts")
    finally:
        #
        # clean up the local temp directory
        #
        unix.RemoveFiles.local("remove temp directory", tempDir)
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    Build a template tar from the first source segment, push it to every
    destination host, configure the new segments from it and remove the
    transferred tar and the local temp directory.

    directives should be composed of GpCopySegmentDirectoryDirective values

    Returns None. Raises ExceptionNoStackTraceNeeded when validation of the
    remote target directories reports failures.
    """
    if len(directives) == 0:
        return

    srcSegments = [d.getSrcSegment() for d in directives]
    destSegments = [d.getDestSegment() for d in directives]
    isTargetReusedLocation = [d.isTargetReusedLocation() for d in directives]
    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(destSegments, isTargetReusedLocation)

    logger.info('Building template directory')
    (tempDir, blankTarFile, tarFileName) = self.__buildTarFileForTransfer(gpEnv, gpArray.master, srcSegments[0], destSegments)

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)
        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      tarFile=tarFileName,
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly)

    # Everything after the template is built runs under try/finally so the
    # local temp directory is removed on failure as well as on success.
    try:
        #
        # validate directories for target segments
        #
        logger.info('Validating remote directories')
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(createConfigureNewSegmentCommand(hostName, 'validate blank segments', True))
        for cmd in cmds:
            self.__pool.addCommand(cmd)
        self.__pool.wait_and_printdots(len(cmds), self.__quiet)

        validationErrors = []
        for item in self.__pool.getCompletedItems():
            results = item.get_results()
            if not results.wasSuccessful():
                if results.rc == 1:
                    # rc 1: every non-blank stderr line is reported as its
                    # own validation failure for that host
                    lines = results.stderr.split("\n")
                    for line in lines:
                        if len(line.strip()) > 0:
                            validationErrors.append("Validation failure on host %s %s" % (item.remoteHost, line))
                else:
                    # any other failure: report the whole command item
                    validationErrors.append(str(item))
        self.__pool.empty_completed_items()
        if validationErrors:
            raise ExceptionNoStackTraceNeeded("\n" + ("\n".join(validationErrors)))

        #
        # copy tar from master to target hosts
        #
        logger.info('Copying template directory file')
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(gp.RemoteCopy("copy segment tar", blankTarFile, hostName, tarFileName))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "building and transferring basic segment directory")

        #
        # unpack and configure new segments
        #
        logger.info('Configuring new segments')
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(createConfigureNewSegmentCommand(hostName, 'configure blank segments', False))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "unpacking basic segment directory")

        #
        # Clean up copied tar from each remote host
        #
        logger.info('Cleaning files')
        cmds = []
        # only the host names are needed; plain key iteration also avoids
        # the Python-2-only dict.iteritems()
        for hostName in destSegmentByHost.keys():
            cmds.append(unix.RemoveFiles('remove tar file', tarFileName, ctxt=gp.REMOTE, remoteHost=hostName))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(cmds, "cleaning up tar file on segment hosts")
    finally:
        #
        # clean up the local temp directory
        #
        unix.RemoveFiles.local('remove temp directory', tempDir)
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    Validate the target directories, configure the new segments through
    gp.ConfigureNewSegment and copy db_dumps files from each source segment
    to its matching destination segment.

    directives should be composed of GpCopySegmentDirectoryDirective values

    Each destination segment is annotated with primaryHostname and
    primarySegmentPort taken from its source segment.

    Returns None. Raises ExceptionNoStackTraceNeeded when validation of the
    remote target directories reports failures.
    """
    if len(directives) == 0:
        return

    srcSegments = []
    destSegments = []
    isTargetReusedLocation = []
    for directive in directives:
        srcSegment = directive.getSrcSegment()
        destSegment = directive.getDestSegment()
        destSegment.primaryHostname = srcSegment.getSegmentHostName()
        destSegment.primarySegmentPort = srcSegment.getSegmentPort()
        srcSegments.append(srcSegment)
        destSegments.append(destSegment)
        isTargetReusedLocation.append(directive.isTargetReusedLocation())

    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(
        destSegments, isTargetReusedLocation)

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)
        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly,
                                      forceoverwrite=self.__forceoverwrite)

    #
    # validate directories for target segments
    #
    self.__logger.info('Validating remote directories')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(
            createConfigureNewSegmentCommand(hostName,
                                             'validate blank segments', True))
    for cmd in cmds:
        self.__pool.addCommand(cmd)
    self.__pool.wait_and_printdots(len(cmds), self.__quiet)

    validationErrors = []
    for item in self.__pool.getCompletedItems():
        results = item.get_results()
        if not results.wasSuccessful():
            if results.rc == 1:
                # rc 1: every non-blank stderr line is reported as its own
                # validation failure for that host
                lines = results.stderr.split("\n")
                for line in lines:
                    if len(line.strip()) > 0:
                        validationErrors.append(
                            "Validation failure on host %s %s" %
                            (item.remoteHost, line))
            else:
                # any other failure: report the whole command item
                validationErrors.append(str(item))
    self.__pool.empty_completed_items()
    if validationErrors:
        raise ExceptionNoStackTraceNeeded("\n" +
                                          ("\n".join(validationErrors)))

    #
    # unpack and configure new segments
    #
    self.__logger.info('Configuring new segments')
    cmds = []
    for hostName in destSegmentByHost.keys():
        cmds.append(
            createConfigureNewSegmentCommand(hostName,
                                             'configure blank segments',
                                             False))
    self.__runWaitAndCheckWorkerPoolForErrorsAndClear(
        cmds, "unpacking basic segment directory")

    #
    # copy dump files from old segment to new segment
    #
    for srcSeg in srcSegments:
        for destSeg in destSegments:
            if srcSeg.content != destSeg.content:
                continue
            src_dump_dir = os.path.join(srcSeg.getSegmentDataDirectory(),
                                        'db_dumps')
            # src_dump_dir is a path on the source segment's host (the same
            # host Scp reads from below), so the existence probe must run
            # there rather than on the destination host.
            cmd = base.Command('check existence of db_dumps directory',
                               'ls %s' % (src_dump_dir),
                               ctxt=base.REMOTE,
                               remoteHost=srcSeg.getSegmentAddress())
            cmd.run()
            if cmd.results.rc == 0:  # Only try to copy directory if it exists
                cmd = Scp('copy db_dumps from old segment to new segment',
                          os.path.join(srcSeg.getSegmentDataDirectory(),
                                       'db_dumps*', '*'),
                          os.path.join(destSeg.getSegmentDataDirectory(),
                                       'db_dumps'),
                          srcSeg.getSegmentAddress(),
                          destSeg.getSegmentAddress(),
                          recursive=True)
                cmd.run(validateAfter=True)
                break
def __copySegmentDirectories(self, gpEnv, gpArray, directives):
    """
    Build a segment template from the first source segment, tar it, push
    the tar to every destination host, configure the new segments from it
    and remove the transferred tar and the local temp directory.

    directives should be composed of GpCopySegmentDirectoryDirective values

    Returns None. Raises ExceptionNoStackTraceNeeded when the array's fault
    strategy is not FAULT_STRATEGY_NONE (the only strategy a template is
    built for here) or when validation of the remote target directories
    reports failures.
    """
    if len(directives) == 0:
        return

    srcSegments = [d.getSrcSegment() for d in directives]
    destSegments = [d.getDestSegment() for d in directives]
    isTargetReusedLocation = [d.isTargetReusedLocation() for d in directives]
    destSegmentByHost = GpArray.getSegmentsByHostName(destSegments)
    newSegmentInfo = gp.ConfigureNewSegment.buildSegmentInfoForNewSegment(
        destSegments, isTargetReusedLocation)

    logger.info('Building template directory')
    # In GPSQL, we need to create a template and copy it to all of failed segments.
    # The template/tar names below are only set up for this fault strategy;
    # fail with a clear message instead of a NameError later on.
    if gpArray.getFaultStrategy() != gparray.FAULT_STRATEGY_NONE:
        raise ExceptionNoStackTraceNeeded(
            'Cannot build segment template: unsupported fault strategy %s' %
            gpArray.getFaultStrategy())

    tempDir = '/tmp/GPSQL'
    templateDir = tempDir + '/gpsql_template' + time.strftime(
        "%Y%m%d_%H%M%S")
    tarFileName = "gpsqlSegmentTemplate.tar"
    blankTarFile = tempDir + "/" + tarFileName

    def createConfigureNewSegmentCommand(hostName, cmdLabel, validationOnly):
        segmentInfo = newSegmentInfo[hostName]
        checkNotNone("segmentInfo for %s" % hostName, segmentInfo)
        return gp.ConfigureNewSegment(cmdLabel,
                                      segmentInfo,
                                      tarFile=tarFileName,
                                      newSegments=True,
                                      verbose=gplog.logging_is_verbose(),
                                      batchSize=self.__parallelDegree,
                                      ctxt=gp.REMOTE,
                                      remoteHost=hostName,
                                      validationOnly=validationOnly)

    # The local temp directory is removed in the finally clause so that a
    # failure in any step does not leave the template and tar behind.
    try:
        unix.MakeDirectory("create blank directory for segment",
                           templateDir).run(validateAfter=True)
        unix.Chmod.local('set permissions on template dir', templateDir,
                         '0700')  # set perms so postgres can start

        logger.info('Creating template')
        srcSegments[0].createTemplate(templateDir)

        # Don't need log files and gpperfmon files in template, nor the
        # postmaster.opts / postmaster.pid files.
        unwantedInTemplate = [
            ('gprecoverseg remove gppermfon data from template',
             '/gpperfmon/data'),
            ('gprecoverseg remove logs from template', '/pg_log/*'),
            ('gprecoverseg remove postmaster.opt from template',
             '/postmaster.opts'),
            ('gprecoverseg remove postmaster.pid from template',
             '/postmaster.pid'),
        ]
        for rmLabel, relativePath in unwantedInTemplate:
            rmCmd = unix.RemoveFiles(rmLabel, templateDir + relativePath)
            rmCmd.run(validateAfter=True)

        # template the temporary directories file
        template_temporary_directories(templateDir, srcSegments[0].content)

        cmd = gp.CreateTar('gpbuildingmirrorsegment tar segment template',
                           templateDir, blankTarFile)
        cmd.run(validateAfter=True)

        #
        # validate directories for target segments
        #
        logger.info('Validating remote directories')
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(
                createConfigureNewSegmentCommand(hostName,
                                                 'validate blank segments',
                                                 True))
        for cmd in cmds:
            self.__pool.addCommand(cmd)
        self.__pool.wait_and_printdots(len(cmds), self.__quiet)

        validationErrors = []
        for item in self.__pool.getCompletedItems():
            results = item.get_results()
            if not results.wasSuccessful():
                if results.rc == 1:
                    # rc 1: every non-blank stderr line is reported as its
                    # own validation failure for that host
                    lines = results.stderr.split("\n")
                    for line in lines:
                        if len(line.strip()) > 0:
                            validationErrors.append(
                                "Validation failure on host %s %s" %
                                (item.remoteHost, line))
                else:
                    # any other failure: report the whole command item
                    validationErrors.append(str(item))
        self.__pool.empty_completed_items()
        if validationErrors:
            raise ExceptionNoStackTraceNeeded("\n" +
                                              ("\n".join(validationErrors)))

        #
        # copy tar from master to target hosts
        #
        logger.info('Copying template directory file')
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(
                gp.RemoteCopy("copy segment tar", blankTarFile, hostName,
                              tarFileName))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(
            cmds, "building and transferring basic segment directory")

        #
        # unpack and configure new segments
        #
        logger.info('Configuring new segments')
        cmds = []
        for hostName in destSegmentByHost.keys():
            cmds.append(
                createConfigureNewSegmentCommand(hostName,
                                                 'configure blank segments',
                                                 False))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(
            cmds, "unpacking basic segment directory")

        #
        # Clean up copied tar from each remote host
        #
        logger.info('Cleaning files')
        cmds = []
        # only the host names are needed; plain key iteration also avoids
        # the Python-2-only dict.iteritems()
        for hostName in destSegmentByHost.keys():
            cmds.append(
                unix.RemoveFiles('remove tar file',
                                 tarFileName,
                                 ctxt=gp.REMOTE,
                                 remoteHost=hostName))
        self.__runWaitAndCheckWorkerPoolForErrorsAndClear(
            cmds, "cleaning up tar file on segment hosts")
    finally:
        #
        # clean up the local temp directory
        #
        unix.RemoveFiles.local('remove temp directory', tempDir)
def rebalance(self):
    """
    Stop the unbalanced primary segments, trigger a segment
    reconfiguration, then run "gprecoverseg -a" in-process to resynchronize.

    SIGINT is ignored while the operation runs; on exit (normal or by
    exception) the worker pool is shut down and the default SIGINT handler
    is reinstalled. Segments that fail to stop are only warned about and
    the operation continues as a partial rebalance.

    Raises Exception when the gprecoverseg step exits with a non-zero code.
    """
    # Get the unbalanced primary segments grouped by hostname
    # These segments are what we will shutdown.
    self.logger.info("Getting unbalanced segments")
    unbalanced_primary_segs = GpArray.getSegmentsByHostName(
        self.gpArray.get_unbalanced_primary_segdbs())
    pool = base.WorkerPool()

    try:
        # Disable ctrl-c
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        self.logger.info("Stopping unbalanced primary segments...")
        for hostname in unbalanced_primary_segs.keys():
            cmd = GpSegStopCmd("stop unbalanced primary segs",
                               self.gpEnv.getGpHome(),
                               self.gpEnv.getGpVersion(),
                               'fast',
                               unbalanced_primary_segs[hostname],
                               ctxt=base.REMOTE,
                               remoteHost=hostname,
                               timeout=600)
            pool.addCommand(cmd)

        base.join_and_indicate_progress(pool)

        failed_count = 0
        completed = pool.getCompletedItems()
        for res in completed:
            if not res.get_results().wasSuccessful():
                failed_count += 1

        allSegmentsStopped = (failed_count == 0)

        if not allSegmentsStopped:
            # the count has to be interpolated, otherwise a literal "%d"
            # ends up in the log
            self.logger.warn(
                "%d segments failed to stop. A full rebalance of the" %
                failed_count)
            self.logger.warn(
                "system is not possible at this time. Please check the")
            self.logger.warn(
                "log files, correct the problem, and run gprecoverseg -r")
            self.logger.warn("again.")
            self.logger.info(
                "gprecoverseg will continue with a partial rebalance.")

        pool.empty_completed_items()
        segment_reconfigurer = SegmentReconfigurer(
            logger=self.logger,
            worker_pool=pool,
            timeout=MIRROR_PROMOTION_TIMEOUT)
        segment_reconfigurer.reconfigure()

        # Final step is to issue a recoverseg operation to resync segments
        self.logger.info("Starting segment synchronization")
        original_sys_args = sys.argv[:]
        # Dedicated name, pre-set to None: if the program cannot be built,
        # the finally clause must not call cleanup() on a leftover stop
        # command (or on an unbound name).
        recover_program = None
        try:
            self.logger.info(
                "=============================START ANOTHER RECOVER========================================="
            )
            # import here because GpRecoverSegmentProgram and GpSegmentRebalanceOperation have a circular dependency
            from gppylib.programs.clsRecoverSegment import GpRecoverSegmentProgram
            # the parser is invoked without explicit arguments, hence the
            # temporary sys.argv override
            sys.argv = ['gprecoverseg', '-a']
            local_parser = GpRecoverSegmentProgram.createParser()
            local_options, args = local_parser.parse_args()
            recover_program = GpRecoverSegmentProgram.createProgram(
                local_options, args)
            recover_program.run()
        except SystemExit as e:
            # a zero exit code is a normal completion
            if e.code != 0:
                self.logger.error(
                    "Failed to start the synchronization step of the segment rebalance."
                )
                self.logger.error(
                    "Check the gprecoverseg log file, correct any problems, and re-run"
                )
                self.logger.error("'gprecoverseg -a'.")
                raise Exception("Error synchronizing.\nError: %s" % str(e))
        finally:
            if recover_program is not None:
                recover_program.cleanup()
            sys.argv = original_sys_args
            self.logger.info(
                "==============================END ANOTHER RECOVER=========================================="
            )
    finally:
        # Errors propagate unchanged; always stop the pool's workers and
        # give Ctrl-C back to the user.
        pool.join()
        pool.haltWork()
        pool.joinWorkers()
        signal.signal(signal.SIGINT, signal.default_int_handler)
def rebalance(self):
    """
    Stop the unbalanced primary segments, issue a distributed query so the
    resulting fault is detected, then run gprecoverseg to resynchronize.

    SIGINT is ignored while the operation runs; on exit (normal or by
    exception) the worker pool is shut down and the default SIGINT handler
    is reinstalled. Segments that fail to stop are only warned about and
    the operation continues as a partial rebalance.
    """
    # Get the unbalanced primary segments grouped by hostname
    # These segments are what we will shutdown.
    logger.info("Getting unbalanced segments")
    unbalanced_primary_segs = GpArray.getSegmentsByHostName(self.gpArray.get_unbalanced_primary_segdbs())
    pool = WorkerPool()

    count = 0

    try:
        # Disable ctrl-c
        signal.signal(signal.SIGINT, signal.SIG_IGN)

        logger.info("Stopping unbalanced primary segments...")
        for hostname in unbalanced_primary_segs.keys():
            cmd = GpSegStopCmd("stop unbalanced primary segs",
                               self.gpEnv.getGpHome(),
                               self.gpEnv.getGpVersion(),
                               'fast',
                               unbalanced_primary_segs[hostname],
                               ctxt=REMOTE,
                               remoteHost=hostname,
                               timeout=600)
            pool.addCommand(cmd)
            count += 1

        pool.wait_and_printdots(count, False)

        failed_count = 0
        completed = pool.getCompletedItems()
        for res in completed:
            if not res.get_results().wasSuccessful():
                failed_count += 1

        if failed_count > 0:
            # the count has to be interpolated, otherwise a literal "%d"
            # ends up in the log
            logger.warn("%d segments failed to stop. A full rebalance of the" % failed_count)
            logger.warn("system is not possible at this time. Please check the")
            logger.warn("log files, correct the problem, and run gprecoverseg -r")
            logger.warn("again.")
            logger.info("gprecoverseg will continue with a partial rebalance.")

        pool.empty_completed_items()

        # issue a distributed query to make sure we pick up the fault
        # that we just caused by shutting down segments
        conn = None
        try:
            logger.info("Triggering segment reconfiguration")
            dburl = dbconn.DbURL()
            conn = dbconn.connect(dburl)
            cmd = ReconfigDetectionSQLQueryCommand(conn)
            pool.addCommand(cmd)
            pool.wait_and_printdots(1, False)
        except Exception:
            # This exception is expected
            pass
        finally:
            if conn:
                conn.close()

        # Final step is to issue a recoverseg operation to resync segments
        logger.info("Starting segment synchronization")
        cmd = GpRecoverseg("rebalance recoverseg")
        pool.addCommand(cmd)
        pool.wait_and_printdots(1, False)
    finally:
        # Errors propagate unchanged; always stop the pool's workers and
        # give Ctrl-C back to the user.
        pool.join()
        pool.haltWork()
        pool.joinWorkers()
        signal.signal(signal.SIGINT, signal.default_int_handler)
def rebalance(self):
    """Stop the unbalanced primary segments and resynchronize the cluster.

    The work is done in three steps:

      1. Stop every unbalanced primary segment, one GpSegStopCmd per host.
      2. Run a ReconfigDetectionSQLQueryCommand so that the fault caused by
         step 1 is picked up.
      3. Run ``gprecoverseg -a`` in-process through GpRecoverSegmentProgram.
         sys.argv is swapped for the duration of that run and restored
         afterwards.

    SIGINT is ignored while the rebalance runs; the handler that was in
    place beforehand is reinstated on the way out, whether the rebalance
    succeeded or not.

    Segments that fail to stop are counted and reported as warnings, after
    which the operation carries on as a partial rebalance.  Errors raised
    by step 2 are deliberately ignored.

    Raises:
        Exception: if the nested gprecoverseg run exits with a non-zero
            status (a SystemExit with code 0 is treated as success).  Any
            other exception propagates with its original traceback.
    """
    # Get the unbalanced primary segments grouped by hostname
    # These segments are what we will shutdown.
    self.logger.info("Getting unbalanced segments")
    unbalanced_primary_segs = GpArray.getSegmentsByHostName(
        self.gpArray.get_unbalanced_primary_segdbs())
    pool = base.WorkerPool()

    # Disable ctrl-c.  signal.signal() returns the handler it replaced, which
    # is kept so it can be put back once the rebalance is over.
    previous_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN)
    try:
        self.logger.info("Stopping unbalanced primary segments...")
        for hostname, segments in unbalanced_primary_segs.items():
            cmd = GpSegStopCmd("stop unbalanced primary segs",
                               self.gpEnv.getGpHome(),
                               self.gpEnv.getGpVersion(),
                               'fast',
                               segments,
                               ctxt=base.REMOTE,
                               remoteHost=hostname,
                               timeout=600)
            pool.addCommand(cmd)

        # One stop command was queued per host.
        pool.wait_and_printdots(len(unbalanced_primary_segs), False)

        failed_count = sum(1 for res in pool.getCompletedItems()
                           if not res.get_results().wasSuccessful())

        if failed_count > 0:
            # The count has to be interpolated here; otherwise the log shows
            # a literal "%d".
            self.logger.warn("%d segments failed to stop. A full rebalance of the" % failed_count)
            self.logger.warn("system is not possible at this time. Please check the")
            self.logger.warn("log files, correct the problem, and run gprecoverseg -r")
            self.logger.warn("again.")
            self.logger.info("gprecoverseg will continue with a partial rebalance.")

        pool.empty_completed_items()

        # issue a distributed query to make sure we pick up the fault
        # that we just caused by shutting down segments
        conn = None
        try:
            self.logger.info("Triggering segment reconfiguration")
            dburl = dbconn.DbURL()
            conn = dbconn.connect(dburl)
            cmd = ReconfigDetectionSQLQueryCommand(conn)
            pool.addCommand(cmd)
            pool.wait_and_printdots(1, False)
        except Exception:
            # This exception is expected
            pass
        finally:
            if conn:
                conn.close()

        # Final step is to issue a recoverseg operation to resync segments
        self.logger.info("Starting segment synchronization")
        original_sys_args = sys.argv[:]
        # Use a dedicated name that starts out as None: if anything fails
        # before createProgram() returns, the cleanup below must not run
        # against one of the worker-pool commands created earlier.
        recover_program = None
        try:
            self.logger.info("=============================START ANOTHER RECOVER=========================================")
            # import here because GpRecoverSegmentProgram and GpSegmentRebalanceOperation have a circular dependency
            from gppylib.programs.clsRecoverSegment import GpRecoverSegmentProgram
            sys.argv = ['gprecoverseg', '-a']
            local_parser = GpRecoverSegmentProgram.createParser()
            local_options, args = local_parser.parse_args()
            recover_program = GpRecoverSegmentProgram.createProgram(local_options, args)
            recover_program.run()
        except SystemExit as e:
            if e.code != 0:
                self.logger.error("Failed to start the synchronization step of the segment rebalance.")
                self.logger.error("Check the gprecoverseg log file, correct any problems, and re-run")
                self.logger.error("'gprecoverseg -a'.")
                raise Exception("Error synchronizing.\nError: %s" % str(e))
        finally:
            try:
                if recover_program is not None:
                    recover_program.cleanup()
            finally:
                # Put the caller's argv back even when cleanup() itself fails.
                sys.argv = original_sys_args
                self.logger.info("==============================END ANOTHER RECOVER==========================================")
    finally:
        # The previous handler is None when it was not installed from Python;
        # signal.signal() rejects None, so there is nothing to restore then.
        if previous_sigint_handler is not None:
            signal.signal(signal.SIGINT, previous_sigint_handler)