def recoverFINISHEDDIRTY(s, context, ftsFileName):
  (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName, context=context)
  # This set of files has transferred successfully. Move it to DONE directory
  # print "Finished dirty for ", ftsFileName, " with ftsID", ftsJID, " status", status, "."  
  # context = fts3.Context(fServer)
  jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
  failedFiles = []
  missFiles = []
  kor = 0
  for fileInfo in jobStat['files']:
    if fileInfo["file_state"] == "FINISHED": continue
    reason = fileInfo["reason"]
    if "Probably stalled" in reason:
      failedFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
    elif "globus_ftp_control_local_pasv failed" in reason:
      failedFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
    elif "500 No such file or directory" in reason:
      print fServer, fileInfo["source_surl"], reason
      print fServer[:-1] + "9/fts3/ftsmon/#/job/" + ftsJID
      missFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
    else:
      kor = kor + 1
      if kor < 2:
        print ftsFileName, fileInfo["source_surl"], fileInfo["reason"][:50]
      failedFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
  # print failedFiles
  cleanUpTransfer(failedFiles, ftsFileName)
  writeTransfer(failedFiles, "TODO/", "D", ftsFileName)
  writeTransfer(missFiles, "DONE/Bad/", "M", ftsFileName)
Example #2
def submit(proxy, toTrans, source, destination):

    # prepare rest job with 200 files per job
    transfers = []
    for files in chunks(toTrans, 200):

        c = pycurl.Curl()
        # create destination and source pfns for job
        for lfn in files:
            print(lfn)
            transfers.append(
                fts3.new_transfer(apply_tfc_to_lfn(source, lfn, c),
                                  apply_tfc_to_lfn(destination, lfn, c)))

        c.close()

        # Submit fts job
        context = fts3.Context('https://fts3.cern.ch:8446',
                               proxy,
                               proxy,
                               verify=True)
        print(fts3.delegate(context, lifetime=timedelta(hours=48),
                            force=False))

        job = fts3.new_job(transfers)

        #print("Monitor link: https://fts3.cern.ch:8449/fts3/ftsmon/#/job/"+fts3.submit(context, job))
        jobid = fts3.submit(context, job)

        #for file in (fts3.get_job_status(context, jobid, list_files=True))["files"]:
        for key, value in (fts3.get_job_status(context, jobid,
                                               list_files=True)).iteritems():
            print key
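The loop above relies on a chunks() helper that is not shown in this excerpt. A minimal sketch of what such a helper usually looks like, yielding fixed-size slices of the input list, is given below; treat it as an assumption rather than the project's actual implementation.

def chunks(lst, size):
    # Plausible stand-in for the chunks() helper used above (an assumption,
    # not the original project's code): yield successive slices of at most
    # `size` items from lst.
    for i in range(0, len(lst), size):
        yield lst[i:i + size]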
Example #3
def getNewStatus(s, f, fid=""):
    if len(fid) < 3:
        (fid, fstat, fIter, fServer) = getStatusForJob(s, f)
    if fid == "-1":
        print "File ", f, "not submitted to FTS?"
        return "Unknown-notsubmitted", -1, 0, "-1"
    context = fts3.Context(fServer)
    ftsStat = fts3.get_job_status(context, fid)
    return ftsStat["job_state"], fid, ftsStat, fServer
Example #4
  def monitorFTS3( self, full = False ):
    if not self.FTSGUID:
      return S_ERROR( "FTSGUID not set, FTS job not submitted?" )

    jobStatusDict = None
    try:
      context = fts3.Context( endpoint = self.FTSServer )
      jobStatusDict = fts3.get_job_status( context, self.FTSGUID, list_files = True )
    except Exception, e:
      return S_ERROR( "Error getting the job status %s" % e )
Example #5
    def monitorFTS3(self, full=False):
        if not self.FTSGUID:
            return S_ERROR("FTSGUID not set, FTS job not submitted?")

        jobStatusDict = None
        try:
            context = fts3.Context(endpoint=self.FTSServer)
            jobStatusDict = fts3.get_job_status(context,
                                                self.FTSGUID,
                                                list_files=True)
        except Exception, e:
            return S_ERROR("Error getting the job status %s" % e)
Example #6
def getNewStatus(s, f, fid="", context=0):
  if len(fid) < 3 :
    (fid, fstat, fIter, fServer) = getStatusForJob(s, f)
  if fid == "-1":
    return "Unknown-notsubmitted", -1, 0, "-1"
  if context == 0:
    context = fts3.Context(fServer)
  try:
    ftsStat = fts3.get_job_status(context, fid)
    return ftsStat["job_state"], fid, ftsStat, fServer
  except:
    print "File ", f, "unknown to FTS?"
    return "Unknown", -1, 0, "-1"
Example #7
def _fts_wait_jobs(context, job_map_list, sleep_time=10):
    """
    Poll the FTS server for each job in job_map_list until every job reaches a
    final state, removing the test files from the destination as jobs finish.
    """
    finished_jobs = []
    while len(finished_jobs) < len(job_map_list):
        for job_map in job_map_list:
            try:
                job_id = job_map['job_id']
                if job_id in finished_jobs:
                    continue
                response = fts3.get_job_status(context, job_id, list_files=True)
                if response['http_status'] == "200 Ok":
                    if response["job_finished"]:
                        finished_jobs.append(job_id)
                        _flush_logging_msg(
                            'Job with id {} finished with job_state:{} | {}/{}'.
                            format(job_id, response['job_state'],
                                   len(finished_jobs), len(job_map_list)))

                        if response['job_state'] == "FINISHED":
                            _gfal_rm_files(job_map['files_to_purge'],
                                           job_map['directory'])
                            _flush_logging_msg(
                                "Removing testing files from destination")
                        else:
                            filenames = []
                            for file_map in response['files']:
                                if file_map['file_state'] == 'FINISHED':
                                    filenames.append(
                                        file_map['dest_surl'].split(
                                            "/dest/")[1])
                            _flush_logging_msg(
                                "Removing testing files from destination")
                            _gfal_rm_files(filenames, job_map['directory'])
                else:
                    _flush_logging_msg('Server http status: {}'.format(
                        response['http_status']))
                    finished_jobs.append(job_id)
            except Exception as e:
                _flush_logging_msg("Polling failed:{}, response:{}".format(
                    e, response))
                finished_jobs.append(job_id)
        _flush_logging_msg(
            "Sleeping for {} seconds before commencing polling again..".format(
                sleep_time))
        time.sleep(sleep_time)
    return None
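Each entry of job_map_list is expected to carry the job_id returned by fts3.submit plus the bookkeeping keys the loop above uses for cleanup. A minimal, hypothetical invocation (the endpoint, job id and file names are placeholders) might look like this:

import fts3.rest.client.easy as fts3

# Hypothetical invocation of the helper above; endpoint, job id and file
# names are placeholders, not values from a real deployment.
context = fts3.Context('https://fts3-pilot.cern.ch:8446')
job_map_list = [{
    'job_id': 'PLACEHOLDER-JOB-ID',       # id returned by fts3.submit()
    'files_to_purge': ['0001.test'],      # test files to delete on success
    'directory': 'loadtest',              # destination directory for cleanup
}]
_fts_wait_jobs(context, job_map_list, sleep_time=30)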
Example #8
def recoverFINISHEDDIRTY(s, ftsFileName):
    (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName)
    # This set of files has transferred successfully. Move it to DONE directory
    print "Finished dirty for ", ftsFileName, " with ftsID", ftsJID, " status", status, "."

    context = fts3.Context(fServer)
    jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
    failedFiles = []
    for fileInfo in jobStat['files']:
        if fileInfo["file_state"] == "FINISHED": continue
        failedFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
    # print failedFiles
    for fF in failedFiles:
        print fF

    cleanUpTransfer(failedFiles, ftsFileName)
    retryFailedTransfer(failedFiles, ftsFileName)
Example #9
def recoverFINISHEDDIRTY(s, ftsFileName):
  (status, ftsJID, fStat, fServer) = getNewStatus(s, ftsFileName)
  if status == "Unknown":
    # Try again ...
    shutil.move(ceBase + "DONE/Dirty/" + ftsFileName, ceBase + "TODO/" + ftsFileName)
    return
  # This set of files has transferred successfully. Move it to DONE directory
  print "Finished dirty for ", ftsFileName, " with ftsID", ftsJID, " status", status, "."
  if ftsJID == -1:
    print "Probably in old sqlite dB. Could not check - retry"
    shutil.move(ceBase + "DONE/Dirty/" + ftsFileName, ceBase + "TODO/" + ftsFileName)
    return
  context = fts3.Context(fServer)
  jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
  failedFiles = []
  for fileInfo in jobStat['files']:
    if fileInfo["file_state"] == "FINISHED": continue
    failedFiles.append((fileInfo["source_surl"], fileInfo["dest_surl"]))
  # print failedFiles
  if len(failedFiles) < 1: return
  cleanUpTransfer(failedFiles, ftsFileName)
  retryFailedTransfer(failedFiles, ftsFileName)
Example #10
    (options, args) = opts.parse_args()

    # get the jobID as the last parameter
    if len(args) < 2 and not options.uniq:
        opts.print_usage()
        sys.exit(1)
    job_id = args[0]

    reasons = []
    if not options.uniq:
        for r in args[1:]:
            reasons.append(convert_reason_to_regexp(r))
    # pprint(reasons)
    context = fts3.Context(options.endpoint)

    job_status = fts3.get_job_status(context, job_id, list_files=True)
    if job_status['job_state'] not in ['FINISHED', 'FINISHEDDIRTY', 'CANCELED', 'FAILED']:
        print "Sorry, job %s has not finished yet, its status is %s" % (job_id, job_status['job_state'])
        sys.exit(0)

    if options.uniq:
        handle_uniq()
        sys.exit(0)

    if options.invert:
        notTransferedFiles = [(f['source_surl'], f['dest_surl']) for f in job_status['files']
                              if f['file_state'] in ['FAILED', 'CANCELED']
                              and not matches(f['reason'], reasons, options.verbose)]

                              #and sanitize_error(f['reason']) not in reasons]
    else:
Example #11
    def monitor(self, context=None, ftsServer=None, ucert=None):
        """ Queries the fts server to monitor the job.
        The internal state of the object is updated depending on the
        monitoring result.

        In case the job is not found on the server, the status is set to 'Failed'

        Within a job, only the transfers having a `fileID` metadata are considered.
        This is to allow for multihop jobs doing a staging

        This method assumes that the attribute self.ftsGUID is set

        :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
        :param ftsServer: the address of the fts server to submit to. Used only if context is
                          not given. if not given either, use the ftsServer object attribute

        :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)

        :returns: {FileID: { status, error } }

                  Possible error numbers

                  * errno.ESRCH: If the job does not exist on the server
                  * errno.EDEADLK: In case the job and file status are inconsistent (see comments inside the code)


    """

        if not self.ftsGUID:
            return S_ERROR("FTSGUID not set, FTS job not submitted?")

        if not context:
            if not ftsServer:
                ftsServer = self.ftsServer
            context = fts3.Context(endpoint=ftsServer,
                                   ucert=ucert,
                                   request_class=ftsSSLRequest,
                                   verify=False)

        jobStatusDict = None
        try:
            jobStatusDict = fts3.get_job_status(context,
                                                self.ftsGUID,
                                                list_files=True)
        # The job is not found
        # Set its status to Failed and return
        except NotFound:
            self.status = 'Failed'
            return S_ERROR(
                errno.ESRCH,
                "FTSGUID %s not found on %s" % (self.ftsGUID, self.ftsServer))
        except FTS3ClientException as e:
            return S_ERROR("Error getting the job status %s" % e)

        now = datetime.datetime.utcnow().replace(microsecond=0)
        self.lastMonitor = now

        newStatus = jobStatusDict['job_state'].capitalize()
        if newStatus != self.status:
            self.status = newStatus
            self.lastUpdate = now
            self.error = jobStatusDict['reason']

        if newStatus in self.FINAL_STATES:
            self._fillAccountingDict(jobStatusDict)

        filesInfoList = jobStatusDict['files']
        filesStatus = {}
        statusSummary = {}

        # Make a copy, since we are potentially
        # deleting objects
        for fileDict in list(filesInfoList):
            file_state = fileDict['file_state'].capitalize()
            file_metadata = fileDict['file_metadata']

            # previous version of the code did not have dictionary as
            # file_metadata
            if isinstance(file_metadata, dict):
                file_id = file_metadata.get('fileID')
            else:
                file_id = file_metadata

            # The transfer does not have a fileID attached to it
            # so it does not correspond to a file in our DB: skip it
            # (typical of jobs with different staging protocol == CTA)
            # We also remove it from the fileInfoList, such that it is
            # not considered for accounting
            if not file_id:
                filesInfoList.remove(fileDict)
                continue

            file_error = fileDict['reason']
            filesStatus[file_id] = {'status': file_state, 'error': file_error}

            # If the state of the file is final for FTS, set ftsGUID of the file to None,
            # such that it is "released" from this job and not updated anymore in future
            # monitoring calls
            if file_state in FTS3File.FTS_FINAL_STATES:
                filesStatus[file_id]['ftsGUID'] = None

            # If the file is not in a final state, but the job is, we return an error
            # FTS can have inconsistencies where the FTS Job is in a final state
            # but not all the files.
            # The inconsistencies are cleaned every hour on the FTS side.
            # https://its.cern.ch/jira/browse/FTS-1482
            elif self.status in self.FINAL_STATES:
                return S_ERROR(
                    errno.EDEADLK,
                    "Job %s in a final state (%s) while File %s is not (%s)" %
                    (self.ftsGUID, self.status, file_id, file_state))

            statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

        # We've removed all the intermediate transfers that we are not interested in
        # so we put this back into the monitoring data such that the accounting is done properly
        jobStatusDict['files'] = filesInfoList
        if newStatus in self.FINAL_STATES:
            self._fillAccountingDict(jobStatusDict)

        total = len(filesInfoList)
        completed = sum([
            statusSummary.get(state, 0) for state in FTS3File.FTS_FINAL_STATES
        ])
        self.completeness = int(100 * completed / total)

        return S_OK(filesStatus)
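A short sketch of how a caller might consume the {FileID: {status, error}} dictionary documented in the docstring above; ftsJob stands for an already-submitted job object exposing the monitor() method and is illustrative only.

# Illustrative only: ftsJob is assumed to be a submitted job object with
# ftsGUID set, exposing the monitor() method above.
res = ftsJob.monitor()
if not res['OK']:
    print(res['Message'])    # e.g. ESRCH (job not found) or EDEADLK (inconsistent states)
else:
    for fileID, info in res['Value'].items():
        print(fileID, info['status'], info['error'])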
Example #12
    def monitorFTS3(self, full=False):
        if not self.FTSGUID:
            return S_ERROR("FTSGUID not set, FTS job not submitted?")

        jobStatusDict = None
        try:
            if not self._fts3context:
                self._fts3context = fts3.Context(endpoint=self.FTSServer,
                                                 request_class=ftsSSLRequest,
                                                 verify=False)
            context = self._fts3context
            jobStatusDict = fts3.get_job_status(context,
                                                self.FTSGUID,
                                                list_files=True)
        except Exception as e:
            return S_ERROR("Error getting the job status %s" % e)

        self.Status = jobStatusDict['job_state'].capitalize()

        filesInfoList = jobStatusDict['files']
        statusSummary = {}
        for fileDict in filesInfoList:
            file_state = fileDict['file_state'].capitalize()
            statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

        total = len(filesInfoList)
        completed = sum(
            [statusSummary.get(state, 0) for state in FTSFile.FINAL_STATES])
        self.Completeness = 100 * completed / total

        if not full:
            return S_OK(statusSummary)

        ftsFilesPrinted = False
        for fileDict in filesInfoList:
            sourceURL = fileDict['source_surl']
            targetURL = fileDict['dest_surl']
            fileStatus = fileDict['file_state'].capitalize()
            reason = fileDict['reason']
            duration = fileDict['tx_duration']
            candidateFile = None
            for ftsFile in self:
                if ftsFile.SourceSURL == sourceURL and ftsFile.TargetSURL == targetURL:
                    candidateFile = ftsFile
                    break
            if candidateFile is None:
                self._log.warn(
                    'FTSFile not found',
                    'Source: %s, Target: %s' % (sourceURL, targetURL))
                if not ftsFilesPrinted:
                    ftsFilesPrinted = True
                    if not len(self):
                        self._log.warn('Monitored FTS job is empty!')
                    else:
                        self._log.warn(
                            'All FTS files are:', '\n' + '\n'.join([
                                'Source: %s, Target: %s' %
                                (ftsFile.SourceSURL, ftsFile.TargetSURL)
                                for ftsFile in self
                            ]))
            else:
                candidateFile.Status = fileStatus
                candidateFile.Error = reason
                candidateFile._duration = duration

                if candidateFile.Status == "Failed":
                    for missingSource in self.missingSourceErrors:
                        if missingSource.match(reason):
                            candidateFile.Error = "MissingSource"

        # # register successful files
        if self.Status in FTSJob.FINALSTATES:
            return self.finalize()
        return S_OK()
Example #13
  def monitor(self, context=None, ftsServer=None, ucert=None):
    """ Queries the fts server to monitor the job

        This method assumes that the attribute self.ftsGUID is set

        :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
        :param ftsServer: the address of the fts server to submit to. Used only if context is
                          not given. if not given either, use the ftsServer object attribute

        :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)

        :returns {FileID: { status, error } }
    """

    if not self.ftsGUID:
      return S_ERROR("FTSGUID not set, FTS job not submitted?")

    if not context:
      if not ftsServer:
        ftsServer = self.ftsServer
      context = fts3.Context(
          endpoint=ftsServer,
          ucert=ucert,
          request_class=ftsSSLRequest,
          verify=False)

    jobStatusDict = None
    try:
      jobStatusDict = fts3.get_job_status(context, self.ftsGUID, list_files=True)
    except FTS3ClientException as e:
      return S_ERROR("Error getting the job status %s" % e)

    now = datetime.datetime.utcnow().replace(microsecond=0)
    self.lastMonitor = now

    newStatus = jobStatusDict['job_state'].capitalize()
    if newStatus != self.status:
      self.status = newStatus
      self.lastUpdate = now
      self.error = jobStatusDict['reason']

    if newStatus in self.FINAL_STATES:
      self._fillAccountingDict(jobStatusDict)

    filesInfoList = jobStatusDict['files']
    filesStatus = {}
    statusSummary = {}

    for fileDict in filesInfoList:
      file_state = fileDict['file_state'].capitalize()
      file_id = fileDict['file_metadata']
      file_error = fileDict['reason']
      filesStatus[file_id] = {'status': file_state, 'error': file_error}

      # If the state of the file is final for FTS, set ftsGUID of the file to None,
      # such that it is "released" from this job and not updated anymore in future
      # monitoring calls
      if file_state in FTS3File.FTS_FINAL_STATES:
        filesStatus[file_id]['ftsGUID'] = None

      statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

    total = len(filesInfoList)
    completed = sum([statusSummary.get(state, 0) for state in FTS3File.FTS_FINAL_STATES])
    self.completeness = 100 * completed / total

    return S_OK(filesStatus)
Example #14
#   You may obtain a copy of the License at
# 
#       http://www.apache.org/licenses/LICENSE-2.0
# 
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import json
import logging
import fts3.rest.client.easy as fts3
from optparse import OptionParser


opts = OptionParser()
opts.add_option('-s', '--endpoint', dest='endpoint', default='https://fts3-pilot.cern.ch:8446')
opts.add_option('-l', '--list', dest='list_files', default=False, action='store_true')

(options, args) = opts.parse_args()
if len(args) < 1:
    raise Exception('Need a job id')
job_id = args[0]

logging.getLogger('fts3.rest.client').setLevel(logging.DEBUG)

context = fts3.Context(options.endpoint)
job_status = fts3.get_job_status(context, job_id, list_files=options.list_files)
print json.dumps(job_status, indent=2)
Example #15
  def monitor(self, context=None, ftsServer=None, ucert=None):
    """ Queries the fts server to monitor the job

        This method assumes that the attribute self.ftsGUID is set

        :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
        :param ftsServer: the address of the fts server to submit to. Used only if context is
                          not given. if not given either, use the ftsServer object attribute

        :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)

        :returns {FileID: { status, error } }
    """

    if not self.ftsGUID:
      return S_ERROR("FTSGUID not set, FTS job not submitted?")

    if not context:
      if not ftsServer:
        ftsServer = self.ftsServer
      context = fts3.Context(
          endpoint=ftsServer,
          ucert=ucert,
          request_class=ftsSSLRequest,
          verify=False)

    jobStatusDict = None
    try:
      jobStatusDict = fts3.get_job_status(context, self.ftsGUID, list_files=True)
    except FTS3ClientException as e:
      return S_ERROR("Error getting the job status %s" % e)

    now = datetime.datetime.utcnow().replace(microsecond=0)
    self.lastMonitor = now

    newStatus = jobStatusDict['job_state'].capitalize()
    if newStatus != self.status:
      self.status = newStatus
      self.lastUpdate = now
      self.error = jobStatusDict['reason']

    if newStatus in self.FINAL_STATES:
      self._fillAccountingDict(jobStatusDict)

    filesInfoList = jobStatusDict['files']
    filesStatus = {}
    statusSummary = {}

    for fileDict in filesInfoList:
      file_state = fileDict['file_state'].capitalize()
      file_id = fileDict['file_metadata']
      file_error = fileDict['reason']
      filesStatus[file_id] = {'status': file_state, 'error': file_error}

      # If the state of the file is final for FTS, set ftsGUID of the file to None,
      # such that it is "released" from this job and not updated anymore in future
      # monitoring calls
      if file_state in FTS3File.FTS_FINAL_STATES:
        filesStatus[file_id]['ftsGUID'] = None

      # If the file is not in a final state, but the job is, we return an error
      # FTS can have inconsistencies where the FTS Job is in a final state
      # but not all the files.
      # The inconsistencies are cleaned every hour on the FTS side.
      # https://its.cern.ch/jira/browse/FTS-1482
      elif self.status in self.FINAL_STATES:
        return S_ERROR(errno.EDEADLK, "Job %s in a final state (%s) while File %s is not (%s)" %
                       (self.ftsGUID, self.status, file_id, file_state))

      statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

    total = len(filesInfoList)
    completed = sum([statusSummary.get(state, 0) for state in FTS3File.FTS_FINAL_STATES])
    self.completeness = 100 * completed / total

    return S_OK(filesStatus)
Example #16
    def killThread(self, thread_id, transfers):
        """This is the worker thread function for kill command.
        """
        while True:
            transfer_list = transfers.get()
            self.logger.info("Starting thread %s" % (thread_id))
            user = transfer_list[0]['username']
            group = transfer_list[0]['user_group']
            role = transfer_list[0]['user_role']

            uiSetupScript = getattr(self.config, 'UISetupScript', None)

            self.logger.debug("Trying to get DN for %s %s %s %s" % (user, self.logger, self.config.opsProxy, self.config.opsProxy))
            try:
                userDN = getDNFromUserName(user, self.logger, ckey=self.config.opsProxy, cert=self.config.opsProxy)
            except Exception as ex:
                msg = "Error retrieving the user DN"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                continue
            if not userDN:
                transfers.task_done()
                time.sleep(1)
                continue
            self.logger.debug("user DN: %s" % userDN)

            try:
                defaultDelegation = {'logger': self.logger,
                                     'credServerPath': self.config.credentialDir,
                                     'myProxySvr': 'myproxy.cern.ch',
                                     'min_time_left': getattr(self.config, 'minTimeLeft', 36000),
                                     'serverDN': self.config.serverDN,
                                     'uisource': uiSetupScript,
                                     'cleanEnvironment': getattr(self.config, 'cleanEnvironment', False)}
                if hasattr(self.config, "cache_area"):
                    cache_area = self.config.cache_area
                    defaultDelegation['myproxyAccount'] = re.compile('https?://([^/]*)/.*').findall(cache_area)[0]
            except IndexError:
                self.logger.error('MyproxyAccount parameter cannot be retrieved from %s . ' % self.config.cache_area)
                transfers.task_done()
                time.sleep(1)
                continue
            if getattr(self.config, 'serviceCert', None):
                defaultDelegation['server_cert'] = self.config.serviceCert
            if getattr(self.config, 'serviceKey', None):
                defaultDelegation['server_key'] = self.config.serviceKey
            try:
                defaultDelegation['userDN'] = userDN
                defaultDelegation['group'] = group if group else ''
                defaultDelegation['role'] = role if group else ''
                self.logger.debug('delegation: %s' % defaultDelegation)
                valid_proxy, user_proxy = getProxy(defaultDelegation, self.logger)
            except Exception as ex:
                msg = "Error getting the user proxy"
                msg += str(ex)
                msg += str(traceback.format_exc())
                self.logger.error(msg)
                transfers.task_done()
                time.sleep(1)
                continue

            # TODO: take server from db, right now, take only the first of the list and assuming it valid for all
            try:
                # TODO: debug u added during info upload. To be fixed soon! For now worked around
                fts_server = transfer_list[0]['fts_instance'].split('u')[1]
                self.logger.info("Delegating proxy to %s" % fts_server)
                context = fts3.Context(fts_server, user_proxy, user_proxy, verify=True)
                self.logger.debug(fts3.delegate(context, lifetime=timedelta(hours=48), force=False))

                self.logger.info("Proxy delegated. Grouping files by jobId")
                jobs = {}
                for fileToKill in transfer_list:
                    # TODO: debug u added during info upload. To be fixed soon! For now worked around
                    jid = str(fileToKill['fts_id']).split('u')[1]
                    if jid not in jobs:
                        jobs[jid] = []
                    jobs[jid].append(fileToKill)

                self.logger.info("Found %s jobIds", len(jobs.keys()))
                self.logger.debug("jobIds: %s", jobs.keys())

                # list for files killed or failed to
                killed = []
                too_late = []

                for ftsJobId, files in jobs.iteritems():
                    self.logger.info("Cancelling transfers in %s" % ftsJobId)

                    ref_lfns = [str(x['destination_lfn'].split('/store/')[1]) for x in files]
                    source_lfns = [x['source_lfn'] for x in files]

                    job_list = fts3.get_job_status(context, ftsJobId, list_files=True)
                    tx = job_list['files']

                    # TODO: this workaround is needed to get FTS file id, we may want to add a column in the db?
                    idListToKill = [x['file_id'] for x in tx
                                    if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]

                    # needed for the state update
                    lfnListToKill = [ref_lfns.index(str(x['dest_surl'].split('/cms/store/')[1])) for x in tx
                                       if x['dest_surl'].split('/cms/store/')[1] in ref_lfns]

                    self.logger.debug("List of ids to cancel for job %s: %s" % (ftsJobId, idListToKill))
                    res = fts3.cancel(context, ftsJobId, idListToKill)
                    self.logger.debug('Kill command result: %s' % json.dumps(res))

                    if not isinstance(res, list):
                        res = [res]

                    # Verify if the kill command succeeded
                    for k, kill_res in enumerate(res):
                        indexToUpdate = lfnListToKill[k]
                        if kill_res in ("FINISHEDDIRTY", "FINISHED", "FAILED"):
                            self.logger.debug(source_lfns[indexToUpdate])
                            too_late.append(getHashLfn(source_lfns[indexToUpdate]))
                        else:
                            killed.append(getHashLfn(source_lfns[indexToUpdate]))

                # TODO: decide how to update status for too_late files
                killed += too_late
                self.logger.debug('Updating status of killed files: %s' % killed)

                if len(killed) > 0:
                    data = dict()
                    data['asoworker'] = self.config.asoworker
                    data['subresource'] = 'updateTransfers'
                    data['list_of_ids'] = killed
                    data['list_of_transfer_state'] = ["KILLED" for _ in killed]
                    self.oracleDB.post(self.config.oracleFileTrans,
                                       data=encodeRequest(data))
                    self.logger.debug("Marked killed %s" % killed)
            except:
                # TODO: split and improve try/except
                self.logger.exception('Kill command failed')

            transfers.task_done()
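Stripped of the proxy handling and bookkeeping above, the cancel pattern itself is short. A minimal sketch, with placeholder endpoint, proxy path and job id, that cancels only the transfers of a job still in a non-terminal state (FINISHED, FAILED and CANCELED being the terminal file states):

import fts3.rest.client.easy as fts3

# Minimal sketch of the cancel pattern used above; endpoint, proxy path and
# job id are placeholders.
context = fts3.Context('https://fts3.cern.ch:8446', 'proxy.pem', 'proxy.pem', verify=True)
job_id = 'PLACEHOLDER-JOB-ID'
files = fts3.get_job_status(context, job_id, list_files=True)['files']
ids_to_kill = [f['file_id'] for f in files
               if f['file_state'] not in ('FINISHED', 'FAILED', 'CANCELED')]
result = fts3.cancel(context, job_id, ids_to_kill)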
Example #17
    def worker(self, i, input):
        """
        - get a token for fts
        - loop over users in queue
        - for each user get the list of jobid from filenames in Monitor/user folder
        - monitor the status of the job
        - if final, look the file statuses of the files
        - update the db state
        - remove file from the source (raise no critical error)

        :param i: id number of the thread
        :param input: users
        :return:
        """
        if not self.config.TEST:
            context = fts3.Context(self.config_getter.serverFTS,
                                   self.config_getter.opsProxy,
                                   self.config_getter.opsProxy,
                                   verify=True)

        logger = self.logger  # setProcessLogger('Mon'+str(i))
        logger.info("Process %s is starting. PID %s", i, os.getpid())
        Update = update(logger, self.config_getter)

        while not self.STOP:
            if input.empty():
                time.sleep(10)
                continue
            try:
                user = input.get()
            except (EOFError, IOError):
                crashMessage = "Hit EOF/IO in getting new work\n"
                crashMessage += "Assuming this is a graceful break attempt.\n"
                logger.error(crashMessage)
                break

            for File in os.listdir('Monitor/' + user):
                job = File.split('.')[0]
                try:
                    if not self.config.TEST:
                        results = fts3.get_job_status(context,
                                                      job,
                                                      list_files=False)
                        self.logger.info('Getting status for job: ' + job +
                                         ' ' + results['job_state'])
                    else:
                        time.sleep(random.randint(0, random.randint(0, 3)))
                        lf = json.loads(
                            open('Monitor/' + user + '/' + File).read())
                        if random.randint(0, random.randint(0, 5)) == 0:
                            results = {
                                'job_state':
                                'FINISHED',
                                'files': [{
                                    'file_metadata': {
                                        'lfn': x
                                    },
                                    'file_state': 'FINISHED'
                                } for x in lf]
                            }
                        else:
                            results = {'job_state': 'SUBMITTED'}
                        self.logger.info('Getting status for job: ' + job +
                                         ' ' + results['job_state'])
                except Exception:
                    logger.exception('Failed get job status for %s' % job)
                    continue

                if results['job_state'] in [
                        'FINISHED', 'FAILED', 'FINISHEDDIRTY', 'CANCELED'
                ]:
                    if not self.config.TEST:
                        try:
                            results = fts3.get_job_status(context,
                                                          job,
                                                          list_files=True)
                        except Exception:
                            logger.exception(
                                'Failed get file statuses for %s' % job)
                            continue

                    self.logger.info('Updating status for job: ' + job)
                    failed_lfn = list()
                    failed_reasons = list()
                    done_lfn = list()
                    for Fl in results['files']:
                        lfn = Fl['file_metadata']['lfn']
                        if Fl['file_state'] == 'FINISHED':
                            done_lfn.append(lfn)
                        else:
                            failed_lfn.append(lfn)
                            if Fl['reason'] is not None:
                                self.logger.warning('Failure reason: ' +
                                                    Fl['reason'])
                                failed_reasons.append(Fl['reason'])
                            else:
                                self.logger.exception(
                                    'Failure reason not found')
                                failed_reasons.append(
                                    'unable to get failure reason')

                    try:
                        logger.info(
                            'Marking job %s files done and %s files  failed for job %s'
                            % (len(done_lfn), len(failed_lfn), job))
                        doneReady = Update.transferred(done_lfn)
                        failedReady = Update.failed(failed_lfn, failed_reasons)
                    except Exception:
                        logger.exception('Failed to update states')
                        continue

                    if doneReady == 1 or failedReady == 1:
                        continue

                    try:
                        logger.info('Removing' + 'Monitor/' + user + '/' +
                                    File)
                        os.rename('Monitor/' + user + '/' + File,
                                  'Done/' + File)
                    except Exception:
                        logger.exception('failed to remove monitor file')
                        continue
            input.task_done()
            self.active_users.remove(user)
            time.sleep(1)
        logger.debug("Worker %s exiting.", i)
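The worker above derives the FTS job id from each file name under Monitor/<user>/ and, in test mode, reads a JSON list of LFNs from it. A hypothetical producer side matching those conventions (user name, job id and LFN are placeholders, and the Monitor/<user>/ directory is assumed to exist) would look like:

import json

# Hypothetical producer matching the worker's file conventions above: one
# JSON file per submitted FTS job under Monitor/<user>/, named <jobid>.json
# and containing the list of transferred LFNs.
user, jobid = 'someuser', 'PLACEHOLDER-JOB-ID'
lfns = ['/store/user/someuser/file0.root']
with open('Monitor/%s/%s.json' % (user, jobid), 'w') as fp:
    json.dump(lfns, fp)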
Example #18
    def monitor(self, context=None, ftsServer=None, ucert=None):
        """ Queries the fts server to monitor the job

        This method assumes that the attribute self.ftsGUID is set

        :param context: fts3 context. If not given, it is created (see ftsServer & ucert param)
        :param ftsServer: the address of the fts server to submit to. Used only if context is
                          not given. if not given either, use the ftsServer object attribute

        :param ucert: path to the user certificate/proxy. Might be inferred by the fts cli (see its doc)

        :returns {FileID: { status, error } }
    """

        if not self.ftsGUID:
            return S_ERROR("FTSGUID not set, FTS job not submitted?")

        if not context:
            if not ftsServer:
                ftsServer = self.ftsServer
            context = fts3.Context(endpoint=ftsServer,
                                   ucert=ucert,
                                   request_class=ftsSSLRequest,
                                   verify=False)

        jobStatusDict = None
        try:
            jobStatusDict = fts3.get_job_status(context,
                                                self.ftsGUID,
                                                list_files=True)
        except FTS3ClientException as e:
            return S_ERROR("Error getting the job status %s" % e)

        now = datetime.datetime.utcnow().replace(microsecond=0)
        self.lastMonitor = now

        newStatus = jobStatusDict['job_state'].capitalize()
        if newStatus != self.status:
            self.status = newStatus
            self.lastUpdate = now
            self.error = jobStatusDict['reason']

        if newStatus in self.FINAL_STATES:
            self._fillAccountingDict(jobStatusDict)

        filesInfoList = jobStatusDict['files']
        filesStatus = {}
        statusSummary = {}

        for fileDict in filesInfoList:
            file_state = fileDict['file_state'].capitalize()
            file_id = fileDict['file_metadata']
            file_error = fileDict['reason']
            filesStatus[file_id] = {'status': file_state, 'error': file_error}

            statusSummary[file_state] = statusSummary.get(file_state, 0) + 1

        total = len(filesInfoList)
        completed = sum([
            statusSummary.get(state, 0) for state in FTS3File.FTS_FINAL_STATES
        ])
        self.completeness = 100 * completed / total

        return S_OK(filesStatus)
Example #19
                            const=-1,
                            default=0,
                            help='Do not resubmit any failed transfers')
    opts.add_option_group(loop_options)
    (options, args) = opts.parse_args()

    if len(args) < 1:
        opts.print_usage()
        sys.exit(1)
    job_id = args[0]

    context = fts3.Context(options.endpoint)
    if options.reg_endpoint:
        reg_context = fts3.Context(options.reg_endpoint)

    job_status = fts3.get_job_status(context, job_id, list_files=True)
    if job_status['job_state'] not in [
            'FINISHED', 'FINISHEDDIRTY', 'CANCELED', 'FAILED'
    ]:
        print "Sorry, job %s has not finished yet, its status is %s" % (
            job_id, job_status['job_state'])
        sys.exit(0)

    if job_status['job_state'] != 'FINISHED':
        print "The job had problems, its status is %s" % job_status[
            'job_state']

    transferedFiles = [
        f['dest_surl'] for f in job_status['files']
        if f['file_state'] in ['FINISHED'] or f['reason'] ==
        'DESTINATION file already exists and overwrite is not enabled'
Example #20
def check_FTSJob(logger, ftsContext, jobid, jobsEnded, jobs_ongoing, done_id,
                 failed_id, failed_reasons):
    """
    get transfers state per jobid

    INPUT PARAMS
    :param logger: a logging object
    :param ftsContext:
    :param jobid:
    OUTPUT PARAMS
    :param jobsEnded:
    :param jobs_ongoing:
    :param done_id:
    :param failed_id:
    :param failed_reasons:
    - check if the fts job is in final state (FINISHED, FINISHEDDIRTY, CANCELED, FAILED)
    - get file transfers states and get corresponding oracle ID from FTS file metadata
    - update states on oracle
    """

    logger.info("Getting state of job %s" % jobid)

    jobs_ongoing.append(jobid)

    try:
        status = fts3.get_job_status(ftsContext, jobid, list_files=False)
    except HTTPException as hte:
        logger.exception("failed to retrieve status for %s " % jobid)
        logger.exception("httpExeption headers %s " % hte.headers)
        if hte.status == 404:
            logger.exception("%s not found in FTS3 DB" % jobid)
            jobs_ongoing.remove(jobid)
        return
    except Exception:
        logger.exception("failed to retrieve status for %s " % jobid)
        return

    logger.info("State of job %s: %s" % (jobid, status["job_state"]))

    if status["job_state"] in [
            'FINISHED', 'FINISHEDDIRTY', "FAILED", "CANCELED"
    ]:
        jobsEnded.append(jobid)
    if status["job_state"] in [
            'ACTIVE', 'FINISHED', 'FINISHEDDIRTY', "FAILED", "CANCELED"
    ]:
        file_statuses = fts3.get_job_status(ftsContext, jobid,
                                            list_files=True)['files']
        done_id[jobid] = []
        failed_id[jobid] = []
        failed_reasons[jobid] = []
        files_to_remove = []
        fileIds_to_remove = []

        # get the job content from local file
        jobContentFileName = 'task_process/transfers/' + jobid + '.json'
        with open(jobContentFileName, 'r') as fp:
            fileIds = json.load(fp)

        for file_status in file_statuses:
            _id = file_status['file_metadata']['oracleId']
            if not _id in fileIds:
                # this file xfer has been handled already in a previous iteration
                # nothing to do
                continue

            tx_state = file_status['file_state']

            # xfers have only 3 terminal states: FINISHED, FAILED, and CANCELED see
            # https://fts3-docs.web.cern.ch/fts3-docs/docs/state_machine.html
            if tx_state == 'FINISHED':
                done_id[jobid].append(_id)
                files_to_remove.append(file_status['source_surl'])
                fileIds_to_remove.append(_id)
            elif tx_state == 'FAILED' or tx_state == 'CANCELED':
                failed_id[jobid].append(_id)
                if file_status['reason']:
                    logger.info('Failure reason: ' + file_status['reason'])
                    failed_reasons[jobid].append(file_status['reason'])
                else:
                    logger.exception('Failure reason not found')
                    failed_reasons[jobid].append(
                        'unable to get failure reason')
                files_to_remove.append(file_status['source_surl'])
                fileIds_to_remove.append(_id)
            else:
                # file transfer is not terminal:
                if status["job_state"] == 'ACTIVE':
                    # if job is still ACTIVE file status will be updated in future run. See:
                    # https://fts3-docs.web.cern.ch/fts3-docs/docs/state_machine.html
                    pass
                else:
                    # job status is terminal but file xfer status is not.
                    # something went wrong inside FTS and a stuck transfers is waiting to be
                    # removed by the reapStalledTransfers https://its.cern.ch/jira/browse/FTS-1714
                    # mark as failed
                    failed_id[jobid].append(_id)
                    logger.info('Failure reason: stuck inside FTS')
                    failed_reasons[jobid].append(file_status['reason'])
        if files_to_remove:
            list_of_surls = ''  # gfal commands take list of SURL as a list of blank-separated strings
            for f in files_to_remove:
                list_of_surls += str(
                    f) + ' '  # convert JSON u'srm://....' to plain srm://...
            removeLogFile = './task_process/transfers/remove_files.log'
            remove_files_in_bkg(list_of_surls, removeLogFile)
            # remove those file Id's from the list and update the json disk file
            fileIds = list(set(fileIds) - set(fileIds_to_remove))
            jobContentTmp = jobContentFileName + '.tmp'
            with open(jobContentTmp, 'w') as fp:
                json.dump(fileIds, fp)
            os.rename(jobContentTmp, jobContentFileName)
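The accumulator arguments documented in the docstring above are plain lists and dictionaries created by the caller; a minimal, illustrative driving loop (logger, ftsContext and the job ids are placeholders) might be:

# Illustrative caller of check_FTSJob above; logger and ftsContext are assumed
# to exist already, and the job ids are placeholders.
jobsEnded, jobs_ongoing = [], []
done_id, failed_id, failed_reasons = {}, {}, {}
for jobid in ['PLACEHOLDER-JOB-ID-1', 'PLACEHOLDER-JOB-ID-2']:
    check_FTSJob(logger, ftsContext, jobid, jobsEnded, jobs_ongoing,
                 done_id, failed_id, failed_reasons)
print(done_id, failed_id, failed_reasons)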
Example #21
def lookAtFile(s, fN):
    tFN = fN.split("/")[-1]
    (ftsJID, stat, fIter, fServer) = getStatusForJob(s, tFN)
    if fServer == "-1":
        print "Unknown FTS job? Retry.", tFN
        shutil.move(fN, ceBase + "TODO/" + tFN)
        # if tFN.startswith("M"):
        #  print "Unknown FTS job? Retry.", tFN
        #  shutil.move(fN, ceBase + "TODO/" + tFN)
        # else:
        #  print "Unknown FTS job?", tFN
        return 0
    context = fts3.Context(fServer)
    try:
        jobStat = fts3.get_job_status(context, ftsJID, list_files=True)
        for fileInfo in jobStat['files']:
            # print(tfn,)
            reason = fileInfo["reason"]
            if "No such file or directory" in reason:
                # print tFN, fServer, reason
                #         if "cern.ch" in fServer:
                #           print "Failed for CERN FTS server : retry"
                # #          shutil.move(fN, ceBase + "TODO/" + tFN)
                #           return 0
                if "SOURCE" in reason:
                    print "Missing source : ", fileInfo["source_surl"].split(
                        "SFN="
                    )[1], fServer[:-1] + "9/fts3/ftsmon/#/job/" + ftsJID

            elif "TRANSFER CHECKSUM MISMATCH" in reason:
                print "Transfer checksum mismatch - retrying ", tFN
                shutil.move(fN, ceBase + "TODO/" + tFN)
                return 0
            elif "Probably stalled" in reason:
                print "Stalled transfer - retrying ", tFN
                shutil.move(fN, ceBase + "TODO/" + tFN)
                return 0
            elif "SOURCE SRM_GET_TURL error on the turl" in reason:
                print "srm failure : probably diskserver was down. Retry"
                shutil.move(fN, ceBase + "TODO/" + tFN)
                return 0
            elif "Communication error on send" in reason:
                print "srm failure : Known (old) problem with RAL FTS system. Retry"
                shutil.move(fN, ceBase + "TODO/" + tFN)
                return 0
            elif "Transfer canceled because the gsiftp performance marker timeout" in reason:
                print "Recoverable error : 6 minute timeout exceeded. Retry"
                shutil.move(fN, ceBase + "TODO/" + tFN)
                return 0
            elif "bad data was encountered" in reason:
                print "Recoverable error : Command failed. : bad data was encountered. Retry"
                shutil.move(fN, ceBase + "TODO/" + tFN)
                return 0
            elif "Command failed : error: commands denied" in reason:
                print "Recoverable error : Command failed : error: commands denied. Retry"
                shutil.move(fN, ceBase + "TODO/" + tFN)
                return 0
            else:
                print tFN, fServer[:
                                   -1] + "9/fts3/ftsmon/#/job/" + ftsJID, fileInfo[
                                       "reason"]
                print fileInfo["source_surl"].split("SFN=")[1]
                continue
        # print " .......... "
        # print fServer[:-1] + "9/fts3/ftsmon/#/job/" + ftsJID
        # print " .......... "
    except:
        print "Could not find any information for ", tFN, ". Try the transfer again."
        shutil.move(fN, ceBase + "TODO/B" + tFN)
        return -1
    # print jobStat
    return 0
Example #22
  def monitorFTS3( self, full = False ):
    if not self.FTSGUID:
      return S_ERROR( "FTSGUID not set, FTS job not submitted?" )

    jobStatusDict = None
    try:
      if not self._fts3context:
        self._fts3context = fts3.Context( endpoint = self.FTSServer, request_class = ftsSSLRequest, verify = False )
      context = self._fts3context
      jobStatusDict = fts3.get_job_status( context, self.FTSGUID, list_files = True )
    except Exception as e:
      return S_ERROR( "Error getting the job status %s" % e )

    self.Status = jobStatusDict['job_state'].capitalize()

    filesInfoList = jobStatusDict['files']
    statusSummary = {}
    for fileDict in filesInfoList:
      file_state = fileDict['file_state'].capitalize()
      statusSummary[file_state] = statusSummary.get( file_state, 0 ) + 1

    total = len( filesInfoList )
    completed = sum( [ statusSummary.get( state, 0 ) for state in FTSFile.FINAL_STATES ] )
    self.Completeness = 100 * completed / total

    if not full:
      return S_OK( statusSummary )

    ftsFilesPrinted = False
    for fileDict in filesInfoList:
      sourceURL = fileDict['source_surl']
      targetURL = fileDict['dest_surl']
      fileStatus = fileDict['file_state'].capitalize()
      reason = fileDict['reason']
      duration = fileDict['tx_duration']
      candidateFile = None
      for ftsFile in self:
        if ftsFile.SourceSURL == sourceURL and ftsFile.TargetSURL == targetURL :
          candidateFile = ftsFile
          break
      if candidateFile is None:
        self._log.warn( 'FTSFile not found', 'Source: %s, Target: %s' % ( sourceURL, targetURL ) )
        if not ftsFilesPrinted:
          ftsFilesPrinted = True
          if not len( self ):
            self._log.warn( 'Monitored FTS job is empty!' )
          else:
            self._log.warn( 'All FTS files are:', '\n' + '\n'.join( ['Source: %s, Target: %s' % ( ftsFile.SourceSURL, ftsFile.TargetSURL ) for ftsFile in self] ) )
      else:
        candidateFile.Status = fileStatus
        candidateFile.Error = reason
        candidateFile._duration = duration

        if candidateFile.Status == "Failed":
          for missingSource in self.missingSourceErrors:
            if missingSource.match( reason ):
              candidateFile.Error = "MissingSource"

    # # register successful files
    if self.Status in FTSJob.FINALSTATES:
      return self.finalize()
    return S_OK()
Example #23
import json
import logging

import fts3.rest.client.easy as fts3
from optparse import OptionParser

opts = OptionParser()
opts.add_option('-s',
                '--endpoint',
                dest='endpoint',
                default='https://fts3-pilot.cern.ch:8446')
opts.add_option('-l',
                '--list',
                dest='list_files',
                default=False,
                action='store_true')

(options, args) = opts.parse_args()
if len(args) < 1:
    raise Exception('Need a job id')
job_id = args[0]

logging.getLogger('fts3.rest.client').setLevel(logging.DEBUG)

context = fts3.Context(options.endpoint)
job_status = fts3.get_job_status(context,
                                 job_id,
                                 list_files=options.list_files)
print json.dumps(job_status, indent=2)

jobs_statuses = fts3.get_jobs_statuses(context, [job_id],
                                       list_files=options.list_files)
print json.dumps(jobs_statuses, indent=2)