Ejemplo n.º 1
0
    def _allocateHost(self, corrFileList):
        """
        Choose the host on which a correlator's files should be processed.

        The choice prefers, in this order:
        1. a host returned by ngamsJobMWALib.getBestHost (data locality),
        2. an online host obtained while passing the currently allocated
           hosts to getNextOnlineHost (declustering),
        3. any online host at all.

        corrFileList:    A list of file ids belonging to a correlator (a List of strings)
        return:          A tuple, 0 - a host (hostId:port) (string),
                         1 - a dict (key - fileId, value - FileLocation);
                         the dict is empty when no best host was found

        Not thread-safe: must be called sequentially by a single thread.
        """
        locations = ngamsJobMWALib.getBestHost(corrFileList,
                                               self.__hostAllocDict.keys())
        if (locations and len(locations.keys()) != 0):
            # some files already sit on a host: run the correlator there
            bestHost = locations.values()[0]._svrHost
            return (bestHost, locations)

        # no locality information, fall back to a host not yet allocated
        candidate = ngamsJobMWALib.getNextOnlineHost(
            self.__hostAllocDict.keys())
        if (candidate):
            return (candidate, {})

        warnMsg = ('Cannot find a host that is different from what have been allocated for files %s'
                   % str(corrFileList))
        logger.warning(warnMsg)
        # last resort: accept a host that might have been allocated already,
        # thus compromising the maximum parallel processing
        return (ngamsJobMWALib.getNextOnlineHost(), {})
Ejemplo n.º 2
0
    def _stageFiles(self, cre):
        """
        Determine the host on which this task's files will be staged.

        cre - CorrTaskResult; its _errcode/_errmsg are filled in on failure

        When no execution host is set yet (i.e. a re-try), the best host is
        looked up via ngamsJobMWALib.getBestHost; if no file location is
        returned, the next online host is used instead.

        Returns cre on the two failure paths below:
            _errcode 4 - getBestHost raised an exception
            _errcode 7 - no online host could be obtained

        NOTE(review): only the host-selection step is present here; nothing
        is returned explicitly once a host has been determined.
        """
        # 1 Check all files' locations, and determines the best host
        self._numIngested = len(self._fileLocDict)
        if (not self._taskExeHost):  # this is re-try
            try:
                # guard against a None result so the len() below cannot fail
                self._fileLocDict = ngamsJobMWALib.getBestHost(
                    self.__fileIds, self._blackList) or {}
            except Exception as e:
                cre._errcode = 4
                cre._errmsg = 'Fail to get the best host for file list %s: %s' % (
                    str(self.__fileIds), str(e))
                self.setStatus(STATUS_EXCEPTION)
                dprint(cre._errmsg)
                return cre

            self._numIngested = len(self._fileLocDict)
            if (self._numIngested > 0):
                # take the server host of the first file location returned
                self._taskExeHost = list(self._fileLocDict.values())[0]._svrHost
            else:
                self._taskExeHost = ngamsJobMWALib.getNextOnlineHost(
                    self._blackList)

            if (not self._taskExeHost):
                cre._errcode = 7
                cre._errmsg = 'There are no online NGAS servers available'
                self.setStatus(STATUS_EXCEPTION)
                dprint(cre._errmsg)
                return cre
Ejemplo n.º 3
0
class ObsTask(MapReduceTask):
    """
    MWA Observation, the second level of the RTS MRTask tree
    Thus, a RTSJob consists of multiple ObsTasks
    """
    def __init__(self, obsNum, rtsParam, jobParent):
        """
        Constructor

        obsNum:    (string) observation number/id, i.e. the GPS time of each MWA observation
        rtsParam:    RTS job parameters (class RTSJobParam)
        jobParent:   passed to MapReduceTask.__init__ as the parent task
        """
        MapReduceTask.__init__(
            self, str(obsNum),
            parent=jobParent)  #in case caller still passes in integer
        self.__completedCorrTasks = 0
        self.__rtsParam = rtsParam
        self.__corrRespQ = Queue()  # results handed over by combine()
        self._taskExeHost = ''  # 'host:port' chosen to run the local task
        self._ltComltEvent = threading.Event()  # local task complete event
        self._ltDequeueEvent = threading.Event()  # local task dequeue event
        self._localTaskResult = None
        self._timeOut4LT = rtsParam.LT_timeout
        self._progress = -1
        # keys are 'host:port' strings of hosts that must not be picked again
        self._blackHostList = {}

    def combine(self, mapOutput):
        """
        Hold each correlator's result, thread safe

        mapOutput:    a correlator result; falsy values are ignored
        """
        dprint('Observation %s is combined' % self.getId())
        if (mapOutput):
            self.__corrRespQ.put(mapOutput)  # this is thread safe

    def reduce(self):
        """
        Return results of each correlator, each of which corresponds to images of a subband

        Drains the correlator result queue into an ObsTaskResult, collects the
        image URLs of the successful results, picks an execution host and
        submits an ObsLocalTask to it via HTTP (RUNTASK).

        Error codes set on the ObsTaskResult:
            4 - no image URL was produced by any correlator (returned at once)
            1 - no host could be found to execute the local task
            2 - submitting the local task failed (other than a refused connection)

        NOTE(review): apart from the "no image URL" case, no value is
        returned explicitly by the code below.
        """
        dprint('Observation %s is reduced' % self.getId())
        obsTaskRe = ObsTaskResult(self.getId())
        timeout_retry = 0

        imgurlList = []
        while (1):
            corrTaskRe = None
            try:
                corrTaskRe = self.__corrRespQ.get_nowait()
                if (corrTaskRe._errcode == 0):
                    imgurlList.append(corrTaskRe._imgUrl)
            except Empty:
                break  # queue drained
            obsTaskRe.merge(corrTaskRe)

        if (len(imgurlList) < 1):
            errmsg = 'Fail to find any image urls from correlators'
            obsTaskRe._errmsg = errmsg
            obsTaskRe._errcode = 4
            self.setStatus(STATUS_EXCEPTION)
            dprint(obsTaskRe._errmsg)
            return obsTaskRe

        # 1. decide an exeHost to do the local reduction task
        self._progress = 0
        while (self._progress == 0):
            self._progress = 1
            host = None

            # TODO
            # for loop the imgurllist until find one that is online
            # if cannot find, just get random one that is online
            urlError = 0
            for imgurl in imgurlList:
                try:
                    host = urlparse(imgurl)
                    if (not host):
                        urlError = 1
                        continue
                    else:
                        if ('%s:%d' % (host.hostname, host.port)
                                in self._blackHostList):
                            urlError = 1
                            continue
                        ret = ngamsJobMWALib.pingHost(
                            'http://%s:%d/STATUS' % (host.hostname, host.port))
                        if (ret):
                            # a truthy ping result is treated as a failure
                            urlError = 1
                            continue
                        else:
                            urlError = 0
                            break
                except Exception:
                    # best effort: a URL that cannot be parsed/pinged is skipped
                    urlError = 1
                    continue

            if (urlError):
                #try another random site
                host = ngamsJobMWALib.getNextOnlineHost()
                if (host):
                    self._taskExeHost = host
                else:
                    obsTaskRe._errcode = 1
                    obsTaskRe._errmsg = 'Failed to find any host to execute local job'
                    self.setStatus(STATUS_EXCEPTION)
                    break
            else:
                self._taskExeHost = '%s:%d' % (host.hostname, host.port)

            #  2. construct the local TaskId (jobId__obsNum)
            taskId = '%s__%s' % (self.getParent().getId(), self.getId())
            obsLT = ObsLocalTask(taskId, imgurlList, self.__rtsParam)

            # 3. register the local task
            ngamsJobMWALib.registerLocalTask(taskId, self)

            # 4. - do the real reduction work (i.e. combine all images from correlators into a single one) at a remote node
            strLT = pickle.dumps(obsLT)
            try:
                strRes = urllib2.urlopen('http://%s/RUNTASK' %
                                         self._taskExeHost,
                                         data=strLT,
                                         timeout=15).read()
                logger.debug(
                    'Submit local task, acknowledgement received: %s' % strRes)
            except urllib2.URLError as urlerr:
                if (str(urlerr).find('Connection refused') >
                        -1):  # the host is down
                    #TODO - make it a log!
                    logger.info(
                        'The original host %s is down, changing to another host to download all image files...'
                        % self._taskExeHost)
                    # blacklist the dead host BEFORE forgetting it; doing it
                    # afterwards would only ever blacklist the key None
                    self._blackHostList[self._taskExeHost] = 1
                    self._progress = 0
                    self._taskExeHost = None
                    self._localTaskResult = None
                    continue  # the current host is down, change to another host, and redo file staging
                else:
                    errmsg = 'Fail to schedule obs reduction task on %s: %s' % (
                        self._taskExeHost, str(urlerr))
                    obsTaskRe._errmsg = errmsg
                    obsTaskRe._errcode = 2
                    self.setStatus(STATUS_EXCEPTION)
                    dprint(obsTaskRe._errmsg)
                    break
Ejemplo n.º 4
0
    def _stageFiles(self, cre):
        """
        Pick the execution host for this task and stage its files onto it.

        cre - CorrTaskResult; its _errcode/_errmsg are filled in on failure

        Steps:
        1. If no execution host is set yet (i.e. a re-try), ask
           ngamsJobMWALib.getBestHost for the file locations and use the host
           of the first one; if there is none, use the next online host.
        2. Every file without a known location is looked up with
           getFileLocations and staged from a cluster node; files that are not
           in the cluster, whose lookup failed, or whose cluster staging
           failed on every location are staged from the external archive.
        3. The file-ingestion event is set when all files are already
           ingested, and cleared otherwise.

        Return: cre. On failure the task status is set to STATUS_EXCEPTION
        and cre._errcode is
            4 - getBestHost raised an exception
            7 - no online host could be obtained
            otherwise the error code returned by stageFile for the
            external-archive staging.
        """
        # 1 Check all files' locations, and determines the best host
        self._numIngested = len(self._fileLocDict)
        if (not self._taskExeHost):  # this is re-try
            try:
                # guard against a None result so the len() below cannot fail
                self._fileLocDict = ngamsJobMWALib.getBestHost(
                    self.__fileIds, self._blackList) or {}
            except Exception as e:
                cre._errcode = 4
                cre._errmsg = 'Fail to get the best host for file list %s: %s' % (
                    str(self.__fileIds), str(e))
                self.setStatus(STATUS_EXCEPTION)
                dprint(cre._errmsg)
                return cre

            self._numIngested = len(self._fileLocDict)
            if (self._numIngested > 0):
                self._taskExeHost = list(self._fileLocDict.values())[0]._svrHost
            else:
                self._taskExeHost = ngamsJobMWALib.getNextOnlineHost(
                    self._blackList)

            if (not self._taskExeHost):
                cre._errcode = 7
                cre._errmsg = 'There are no online NGAS servers available'
                self.setStatus(STATUS_EXCEPTION)
                dprint(cre._errmsg)
                return cre

        # 2. For those files that are not on the best host, check if they are inside the cluster
        #    If so, stage them from an cluster node, otherwise, stage them from the external archive
        frmExtList = []
        for fid in self.__fileIds:
            if (fid in self._fileLocDict):
                continue

            # start from "no location" so a failed lookup can never leave
            # fileLoc unbound or reuse the previous file's locations
            fileLoc = []
            try:
                fileLoc = ngamsJobMWALib.getFileLocations(fid)
            except Exception as e:
                cre._errmsg = "Fail to get location for file '%s': %s" % (
                    fid, str(e))
                cre._errcode = 2
                dprint(cre._errmsg)
                # most likely a DB error

            if (not fileLoc or cre._errcode == 2):
                # not in the cluster/or some db error , stage from outside
                frmExtList.append(fid)
                cre._errcode = 0  # reset the error code
                continue

            stageerr = 0
            for loc in fileLoc:
                # record its actual location inside the cluster
                self._fileLocDict[fid] = loc
                # stage from that very location, so each iteration really
                # tries a different cluster node
                stageerr = ngamsJobMWALib.stageFile(
                    [fid], self, self._taskExeHost, loc._svrHost)
                if (0 == stageerr):
                    break
            if (stageerr):
                # if all cluster nodes failed, try the external archive
                frmExtList.append(fid)

        if (len(frmExtList) > 0):
            logger.debug('Staging files %s from Cortex' % str(frmExtList))
            stageerr = ngamsJobMWALib.stageFile(frmExtList, self,
                                                self._taskExeHost)
            if (stageerr):
                if (ERROR_ST_LTADOWN == stageerr):
                    cre._errmsg = "Fail to stage files because Cortex is down!"
                else:
                    cre._errmsg = "Fail to stage files %s from the external archive to %s. Stage errorcode = %d" % (
                        frmExtList, self._taskExeHost, stageerr)
                cre._errcode = stageerr
                self.setStatus(STATUS_EXCEPTION)
                logger.error(cre._errmsg)
                return cre

        if (self._numIngested == len(self.__fileIds)):  # all files are there
            self._fileIngEvent.set()  # so do not block
        else:
            self._fileIngEvent.clear()

        return cre
Ejemplo n.º 5
0
    def reduce(self):
        """
        Return results of each correlator, each of which corresponds to images of a subband

        Drains the correlator result queue into an ObsTaskResult and collects
        the image URLs of the successful results. It then loops: pick an
        execution host (preferring a reachable, non-blacklisted host taken
        from the image URLs, otherwise the next online host), submit an
        ObsLocalTask to it via HTTP (RUNTASK), wait for the dequeue event and
        then, for at most the configured local-task timeout, for the
        completion event. A host that refuses the connection or times out is
        blacklisted and another host is tried.

        Return: ObsTaskResult. _errcode is 0 on success (the result URL of the
        local task is set as image URL); otherwise
            4 - no image URL was produced by any correlator
            1 - no host could be found to execute the local task
            2 - submitting the local task failed (other than a refused connection)
            3 - the local task timed out three times
        """
        dprint('Observation %s is reduced' % self.getId())
        obsTaskRe = ObsTaskResult(self.getId())
        timeout_retry = 0

        imgurlList = []
        while (1):
            corrTaskRe = None
            try:
                corrTaskRe = self.__corrRespQ.get_nowait()
                if (corrTaskRe._errcode == 0):
                    imgurlList.append(corrTaskRe._imgUrl)
            except Empty:
                break  # queue drained
            obsTaskRe.merge(corrTaskRe)

        if (len(imgurlList) < 1):
            errmsg = 'Fail to find any image urls from correlators'
            obsTaskRe._errmsg = errmsg
            obsTaskRe._errcode = 4
            self.setStatus(STATUS_EXCEPTION)
            dprint(obsTaskRe._errmsg)
            return obsTaskRe

        # 1. decide an exeHost to do the local reduction task
        self._progress = 0
        while (self._progress == 0):
            self._progress = 1
            host = None

            # TODO
            # for loop the imgurllist until find one that is online
            # if cannot find, just get random one that is online
            urlError = 0
            for imgurl in imgurlList:
                try:
                    host = urlparse(imgurl)
                    if (not host):
                        urlError = 1
                        continue
                    else:
                        if ('%s:%d' % (host.hostname, host.port)
                                in self._blackHostList):
                            urlError = 1
                            continue
                        ret = ngamsJobMWALib.pingHost(
                            'http://%s:%d/STATUS' % (host.hostname, host.port))
                        if (ret):
                            # a truthy ping result is treated as a failure
                            urlError = 1
                            continue
                        else:
                            urlError = 0
                            break
                except Exception:
                    # best effort: a URL that cannot be parsed/pinged is skipped
                    urlError = 1
                    continue

            if (urlError):
                #try another random site
                # NOTE(review): this fallback does not exclude blacklisted
                # hosts - confirm whether getNextOnlineHost should be given
                # the blacklist here
                host = ngamsJobMWALib.getNextOnlineHost()
                if (host):
                    self._taskExeHost = host
                else:
                    obsTaskRe._errcode = 1
                    obsTaskRe._errmsg = 'Failed to find any host to execute local job'
                    self.setStatus(STATUS_EXCEPTION)
                    break
            else:
                self._taskExeHost = '%s:%d' % (host.hostname, host.port)

            #  2. construct the local TaskId (jobId__obsNum)
            taskId = '%s__%s' % (self.getParent().getId(), self.getId())
            obsLT = ObsLocalTask(taskId, imgurlList, self.__rtsParam)

            # 3. register the local task
            ngamsJobMWALib.registerLocalTask(taskId, self)

            # 4. - do the real reduction work (i.e. combine all images from correlators into a single one) at a remote node
            strLT = pickle.dumps(obsLT)
            try:
                strRes = urllib2.urlopen('http://%s/RUNTASK' %
                                         self._taskExeHost,
                                         data=strLT,
                                         timeout=15).read()
                logger.debug(
                    'Submit local task, acknowledgement received: %s' % strRes)
            except urllib2.URLError as urlerr:
                if (str(urlerr).find('Connection refused') >
                        -1):  # the host is down
                    #TODO - make it a log!
                    logger.info(
                        'The original host %s is down, changing to another host to download all image files...'
                        % self._taskExeHost)
                    # blacklist the dead host BEFORE forgetting it; doing it
                    # afterwards would only ever blacklist the key None
                    self._blackHostList[self._taskExeHost] = 1
                    self._progress = 0
                    self._taskExeHost = None
                    self._localTaskResult = None
                    continue  # the current host is down, change to another host, and redo file staging
                else:
                    errmsg = 'Fail to schedule obs reduction task on %s: %s' % (
                        self._taskExeHost, str(urlerr))
                    obsTaskRe._errmsg = errmsg
                    obsTaskRe._errcode = 2
                    self.setStatus(STATUS_EXCEPTION)
                    dprint(obsTaskRe._errmsg)
                    break

            self._progress = 4.5
            self.setStatus(STATUS_QUEUEING)
            self._ltDequeueEvent.wait()  # no timeout

            # 5. - wait until result comes back
            self.setStatus(STATUS_RUNNING)
            self._ltComltEvent.wait(self._timeOut4LT)
            if (not self._localTaskResult):
                # no result arrived within the timeout
                timeout_retry += 1
                if (timeout_retry > 2):
                    errmsg = 'Timeout when running obs reduction task on %s' % (
                        self._taskExeHost)
                    obsTaskRe._errmsg = errmsg
                    obsTaskRe._errcode = 3
                    self.setStatus(STATUS_EXCEPTION)
                    dprint(obsTaskRe._errmsg)
                    break
                else:
                    logger.info(
                        'The local task %s on node %s has timed out, try another host'
                        % (taskId, self._taskExeHost))
                    # blacklist the unresponsive host while its name is
                    # still known, then reset the state for the next attempt
                    self._blackHostList[self._taskExeHost] = 1
                    self._progress = 0
                    self._taskExeHost = None
                    self._ltComltEvent.clear()
                    self._localTaskResult = None
                    continue
            else:
                obsTaskRe._errcode = 0
                obsTaskRe.setImgUrl(self._localTaskResult.getResultURL())

        return obsTaskRe