Example 1
def submit_transfers(transfers, transfertool='fts3', job_metadata={}, session=None):
    """
    Submit transfer request to a transfertool.

    :param transfers: Dictionary containing request metadata.
    :param transfertool: Transfertool as a string.
    :param job_metadata: Metadata key/value pairs for all files as a dictionary.
    :param session: Database session to use.
    :returns: Dictionary mapping request IDs to submission information (external ID, external host, destination URLs).
    """

    record_counter('core.request.submit_transfer')

    if transfertool == 'fts3':
        ts = time.time()
        transfer_ids = fts3.submit_transfers(transfers, job_metadata)
        record_timer('core.request.submit_transfers_fts3', (time.time() - ts) * 1000)
    else:
        raise NotImplementedError

    for transfer_id in transfer_ids:
        session.query(models.Request)\
               .filter_by(id=transfer_id)\
               .update({'state': RequestState.SUBMITTED,
                        'external_id': transfer_ids[transfer_id]['external_id'],
                        'external_host': transfer_ids[transfer_id]['external_host'],
                        'dest_url': transfer_ids[transfer_id]['dest_urls'][0]},
                       synchronize_session=False)

    return transfer_ids
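
A minimal sketch of the mapping shape that the update loop above expects back from fts3.submit_transfers; all ids and URLs below are invented for illustration only.

transfer_ids = {
    'a1b2c3d4e5f6a7b8c9d0e1f2a3b4c5d6': {                       # request id
        'external_id': '9f8e7d6c5b4a3210fedcba9876543210',      # FTS job id
        'external_host': 'https://fts3.example.org:8446',
        'dest_urls': ['gsiftp://dest.example.org/path/file'],
    },
}

for request_id, submission in transfer_ids.items():
    print(request_id, submission['external_id'], submission['dest_urls'][0])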
Example 2
def query_latest(external_host, state, last_nhours=1):
    """
    Query the latest transfers in last n hours with state.

    :param external_host:  FTS host name as a string.
    :param state:          FTS job state as a string or a dictionary.
    :param last_nhours:    Latest n hours as an integer.
    :returns:              Requests status information as a dictionary.
    """

    record_counter('core.request.query_latest')

    start_time = time.time()
    resps = FTS3Transfertool(external_host=external_host).query_latest(state=state, last_nhours=last_nhours)
    record_timer('core.request.query_latest_fts3.%s.%s_hours' % (external_host, last_nhours), (time.time() - start_time) * 1000)

    if not resps:
        return

    ret_resps = []
    for resp in resps:
        if 'job_metadata' not in resp or resp['job_metadata'] is None or 'issuer' not in resp['job_metadata'] or resp['job_metadata']['issuer'] != 'rucio':
            continue

        if 'request_id' not in resp['job_metadata']:
            # submitted by new submitter
            try:
                logging.debug("Transfer %s on %s is %s, decrease its updated_at." % (resp['job_id'], external_host, resp['job_state']))
                set_transfer_update_time(external_host, resp['job_id'], datetime.datetime.utcnow() - datetime.timedelta(hours=24))
            except Exception as error:
                logging.debug("Exception happened when updating transfer updatetime: %s" % str(error).replace('\n', ''))

    return ret_resps
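
The chain of early continues above reduces to a single predicate on job_metadata; a standalone sketch with an invented helper name and toy responses:

def submitted_by_rucio(resp):
    # Keep only responses whose job_metadata exists, is not None and was issued by Rucio,
    # mirroring the filter applied in query_latest above.
    metadata = resp.get('job_metadata') or {}
    return metadata.get('issuer') == 'rucio'

assert submitted_by_rucio({'job_metadata': {'issuer': 'rucio'}})
assert not submitted_by_rucio({'job_metadata': None})
assert not submitted_by_rucio({})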
Example 3
def query_request_details(request_id, transfertool='fts3', session=None):
    """
    Query the detailed status of a request. Can also be done after the
    external transfer has finished.

    :param request_id: Request-ID as a 32 character hex string.
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Detailed request status information as a dictionary.
    """

    record_counter('core.request.query_request_details')

    req = get_request(request_id, session=session)

    if not req:
        return

    if transfertool == 'fts3':
        ts = time.time()
        tmp = fts3.query_details(req['external_id'], req['external_host'])
        record_timer('core.request.query_details_fts3', (time.time() - ts) * 1000)
        return tmp

    raise NotImplementedError
Example 4
def query(tid, session):
    """
    Query the transfer job information of a single job. Has a chance to progress the job from SUBMITTED or ACTIVE to FINISHED, FAILED or FINISHEDDIRTY.

    :param tid: The transfer job id.
    :returns: The transfer job information.
    """

    record_counter('daemons.mock.fts3.query')

    ts = time.time()
    new_state = random.sample(sum([[FTSState.FINISHED]*15, [FTSState.FAILED]*3, [FTSState.FINISHEDDIRTY]*2, [FTSState.ACTIVE]*80], []), 1)[0]
    record_timer('daemons.mock.fts3.query.000-random_sample', (time.time()-ts)*1000)

    ts = time.time()

    query = session.query(test_models.MockFTSTransfer).filter(and_(test_models.MockFTSTransfer.transfer_id == tid,
                                                                   or_(test_models.MockFTSTransfer.state == FTSState.SUBMITTED,
                                                                       test_models.MockFTSTransfer.state == FTSState.ACTIVE)))

    if query.update({'state': new_state,
                     'last_modified': datetime.datetime.utcnow()}) == 0:
        return None

    r = {'job_state': str(new_state)}
    if new_state == FTSState.FAILED or new_state == FTSState.FINISHEDDIRTY:
        r['reason'] = 'Mock FTS decided to kill your transfer.'
        r['files'] = [{'source_surl': 'mock_src', 'dest_surl': 'mock_dest', 'reason': 'mock failure'}]

    return r
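
The weighted draw above builds a 100-element list and samples one entry; an equivalent self-contained sketch using random.choices with the same 15/3/2/80 weighting, with plain strings standing in for the FTSState values:

import random

# Draw the mock job state with weights 15/3/2/80, matching the list built above.
states = ['FINISHED', 'FAILED', 'FINISHEDDIRTY', 'ACTIVE']
weights = [15, 3, 2, 80]
new_state = random.choices(states, weights=weights, k=1)[0]
print(new_state)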
Example 5
    def __update_atime(self):
        """
        Bulk update atime.
        """
        replicas = []
        for report in self.__reports:
            try:
                # check if scope in report. if not skip this one.
                if 'scope' not in report:
                    record_counter('daemons.tracer.kronos.missing_scope')
                    continue
                else:
                    record_counter('daemons.tracer.kronos.with_scope')

                # for the moment only report with eventType get* are handled.
                if not report['eventType'].startswith('get'):
                    continue
                record_counter('daemons.tracer.kronos.total_get')
                if report['eventType'] == 'get':
                    record_counter('daemons.tracer.kronos.dq2clients')
                elif report['eventType'] == 'get_sm':
                    record_counter('daemons.tracer.kronos.panda_production')
                elif report['eventType'] == 'get_sm_a':
                    record_counter('daemons.tracer.kronos.panda_analysis')
                else:
                    record_counter('daemons.tracer.kronos.other_get')

                # check if the report has the right state.
                if report['clientState'] in self.__excluded_states:
                    continue

                if report['usrdn'] in self.__excluded_usrdns:
                    continue
                if not report['remoteSite']:
                    continue
                replicas.append({'name': report['filename'], 'scope': report['scope'], 'rse': report['remoteSite'], 'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix'])})
            except (KeyError, AttributeError):
                logging.error(format_exc())
                record_counter('daemons.tracer.kronos.report_error')
                continue

            for did in list_parent_dids(report['scope'], report['filename']):
                if did['type'] != DIDType.DATASET:
                    continue
                # do not update _dis datasets
                if did['scope'] == 'panda' and '_dis' in did['name']:
                    continue
                self.__dataset_queue.put({'scope': did['scope'], 'name': did['name'], 'did_type': did['type'], 'rse': report['remoteSite'], 'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix'])})

        logging.info(replicas)

        try:
            ts = time()
            touch_replicas(replicas)
            record_timer('daemons.tracer.kronos.update_atime', (time() - ts) * 1000)
        except:
            logging.error(format_exc())
            record_counter('daemons.tracer.kronos.update_error')

        logging.info('(kronos_file) updated %d replicas' % len(replicas))
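
A self-contained sketch of how one replica record is built from a trace report, as in the loop above; the field names follow the keys read there, the values are toy data.

from datetime import datetime

report = {
    'eventType': 'get_sm',
    'clientState': 'DONE',
    'usrdn': '/DC=example/CN=some user',
    'scope': 'user.jdoe',
    'filename': 'file.root',
    'remoteSite': 'SITE_DATADISK',
    'traceTimeentryUnix': 1500000000,
}

replica = {
    'name': report['filename'],
    'scope': report['scope'],
    'rse': report['remoteSite'],
    'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix']),
}
print(replica)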
Example 6
def poller_latest(external_hosts, once=False, last_nhours=1, fts_wait=1800):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()

    logging.info('polling latest %s hours on hosts: %s' % (last_nhours, external_hosts))
    if external_hosts:
        if type(external_hosts) == str:
            external_hosts = [external_hosts]

    while not graceful_stop.is_set():

        try:
            heartbeat.live(executable, hostname, pid, hb_thread)

            start_time = time.time()
            for external_host in external_hosts:
                logging.debug('polling latest %s hours on host: %s' % (last_nhours, external_host))
                ts = time.time()
                resps = None
                state = [str(FTSState.FINISHED), str(FTSState.FAILED), str(FTSState.FINISHEDDIRTY), str(FTSState.CANCELED)]
                try:
                    resps = transfer.query_latest(external_host, state=state, last_nhours=last_nhours)
                except:
                    logging.error(traceback.format_exc())
                record_timer('daemons.conveyor.poller_latest.000-query_latest', (time.time() - ts) * 1000)

                if resps:
                    logging.info('poller_latest - polling %i requests' % (len(resps)))

                if not resps or resps == []:
                    if once:
                        break
                    logging.info("no requests found. will sleep 60 seconds")
                    continue

                for resp in resps:
                    try:
                        ret = request.update_request_state(resp)
                        # if True, really update request content; if False, only touch request
                        record_counter('daemons.conveyor.poller_latest.update_request_state.%s' % ret)
                    except:
                        logging.error(traceback.format_exc())
            if once:
                break

            time_left = fts_wait - abs(time.time() - start_time)
            if time_left > 0:
                logging.debug("Waiting %s seconds until next FTS terminal state retrieval" % time_left)
                graceful_stop.wait(time_left)
        except RequestException as error:
            logging.error("Failed to contact FTS server: %s" % (str(error)))
        except:
            logging.error(traceback.format_exc())
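
The pacing at the end of the loop above (sleep only for what remains of fts_wait, and wake early on shutdown) can be isolated as follows; a minimal sketch using threading.Event, with the polling work itself elided:

import threading
import time

graceful_stop = threading.Event()   # set() elsewhere to stop the loop

def polling_loop(fts_wait=1800):
    # Do one cycle of work, then wait only for whatever is left of the fts_wait period.
    while not graceful_stop.is_set():
        start_time = time.time()
        # ... one polling cycle would go here ...
        time_left = fts_wait - abs(time.time() - start_time)
        if time_left > 0:
            graceful_stop.wait(time_left)   # returns early if graceful_stop is set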
Example 7
def _submit_transfers(transfertool_obj, transfers, job_params, submitter='submitter', timeout=None, logger=logging.log):
    """
    helper function for submit_transfers. Performs the actual submission of one or more transfers.

    If the bulk submission of multiple transfers fails due to duplicate submissions, the exception
    is propagated to the caller context, which is then responsible for calling this function again for each
    of the transfers separately.
    """
    logger(logging.INFO, 'About to submit job to %s with timeout %s' % (transfertool_obj, timeout))
    # A eid is returned if the job is properly submitted otherwise an exception is raised
    is_bulk = len(transfers) > 1
    eid = None
    start_time = time.time()
    state_to_set = RequestState.SUBMISSION_FAILED
    try:
        record_counter('core.request.submit_transfer')
        eid = transfertool_obj.submit(transfers, job_params, timeout)
        state_to_set = RequestState.SUBMITTED
    except DuplicateFileTransferSubmission:
        if is_bulk:
            raise
    except (TransferToolTimeout, TransferToolWrongAnswer) as error:
        logger(logging.ERROR, 'Failed to submit a job with error %s', str(error), exc_info=True)
    except Exception as error:
        logger(logging.ERROR, 'Failed to submit a job with error %s', str(error), exc_info=True)
        # Keep the behavior from before the refactoring: in case of unexpected exception, only
        # update request state on individual transfers, and do nothing for bulks.
        # Don't know why it's like that.
        #
        # FIXME: shouldn't we always set the state to SUBMISSION_FAILED?
        if is_bulk:
            state_to_set = None

    if eid is not None:
        duration = time.time() - start_time
        logger(logging.INFO, 'Submit job %s to %s in %s seconds' % (eid, transfertool_obj, duration))
        record_timer('daemons.conveyor.{submitter}.submit_bulk_transfer.per_file', (time.time() - start_time) * 1000 / (len(transfers) or 1), labels={'submitter': submitter})
        record_counter('daemons.conveyor.{submitter}.submit_bulk_transfer', delta=len(transfers), labels={'submitter': submitter})
        record_timer('daemons.conveyor.{submitter}.submit_bulk_transfer.files', len(transfers), labels={'submitter': submitter})

    if state_to_set:
        try:
            transfer_core.set_transfers_state(transfers, state=state_to_set, external_host=transfertool_obj.external_host,
                                              external_id=eid, submitted_at=datetime.datetime.utcnow(), logger=logger)
        except Exception:
            logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True)
            if eid is not None:
                # The job is still submitted in the file transfer service but the request is not updated.
                # Possibility to have a double submission during the next cycle. Try to cancel the external request.
                try:
                    logger(logging.INFO, 'Cancel transfer %s on %s', eid, transfertool_obj)
                    request.cancel_request_external_id(transfertool_obj, eid)
                except Exception:
                    logger(logging.ERROR, 'Failed to cancel transfers %s on %s with error' % (eid, transfertool_obj), exc_info=True)
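
If the external submission succeeded but recording the new state locally fails, the code above tries to cancel the external job to avoid a double submission on the next cycle. A toy, self-contained sketch of that compensation pattern; the classes and helpers below are stand-ins, not the real Rucio or FTS API.

class ToyTransfertool:
    def submit(self, transfers, job_params):
        return 'job-123'            # pretend the external service accepted the job

    def cancel(self, eid):
        print('cancelled', eid)

def record_locally(transfers, external_id):
    raise RuntimeError('simulated database failure')

def submit_with_compensation(tool, transfers, job_params):
    eid = tool.submit(transfers, job_params)
    try:
        record_locally(transfers, external_id=eid)
    except Exception:
        try:
            tool.cancel(eid)        # best-effort rollback of the external job
        except Exception:
            pass                    # if this also fails, a double submission may follow
        raise
    return eid

try:
    submit_with_compensation(ToyTransfertool(), transfers=[{}], job_params={})
except RuntimeError:
    pass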
Example 8
def bulk_query_requests(request_host, request_ids, transfertool='fts3', session=None):
    """
    Query the status of a request.

    :param request_host: Name of the external host.
    :param request_ids: List of (Request-ID as a 32 character hex string, External-ID as a 32 character hex string)
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Request status information as a dictionary.
    """

    record_counter('core.request.query_request')

    transfer_ids = []
    for request_id, external_id in request_ids:
        if external_id not in transfer_ids:
            transfer_ids.append(external_id)

    if transfertool == 'fts3':
        try:
            ts = time.time()
            fts_resps = fts3.bulk_query(transfer_ids, request_host)
            record_timer('core.request.query_bulk_request_fts3', (time.time() - ts) * 1000 / len(transfer_ids))
        except Exception:
            raise

        responses = {}
        for request_id, external_id in request_ids:
            fts_resp = fts_resps[external_id]
            if not fts_resp:
                req_status = {}
                req_status['new_state'] = RequestState.LOST
                req_status['request_id'] = request_id
            elif isinstance(fts_resp, Exception):
                req_status = fts_resp
            else:
                req_status = fts_resp
                # needed for unfinished jobs
                req_status['request_id'] = request_id

                if req_status['job_state'] in (str(FTSState.FAILED),
                                               str(FTSState.FINISHEDDIRTY),
                                               str(FTSState.CANCELED)):
                    req_status['new_state'] = RequestState.FAILED
                elif req_status['job_state'] == str(FTSState.FINISHED):
                    req_status['new_state'] = RequestState.DONE

            responses[request_id] = req_status
        return responses
    else:
        raise NotImplementedError

    return None
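
A standalone sketch of the terminal-state mapping applied above, with plain strings standing in for the FTSState and RequestState values:

def map_job_state(job_state):
    # Terminal failure states and the terminal success state map to FAILED/DONE;
    # anything else is not terminal yet and leaves the request state unchanged.
    if job_state in ('FAILED', 'FINISHEDDIRTY', 'CANCELED'):
        return 'FAILED'
    if job_state == 'FINISHED':
        return 'DONE'
    return None

assert map_job_state('FINISHED') == 'DONE'
assert map_job_state('CANCELED') == 'FAILED'
assert map_job_state('ACTIVE') is None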
Example 9
def rucio_unloadhook():
    """ Rucio unload Hook."""
    duration = time() - ctx.env['start_time']
    ip = ctx.env.get('HTTP_X_FORWARDED_FOR')
    if not ip:
        ip = ctx.ip
    # print ctx.env.get('request_id'), ctx.env.get('REQUEST_METHOD'), ctx.env.get('REQUEST_URI'), ctx.data, duration, ctx.env.get('issuer'), ip
    # Record a time series for each REST operation
    time_serie_name = '.'.join(('http', 'methods', ctx.env.get('REQUEST_METHOD'), 'resources.'))
    time_serie_name += '.'.join(filter(None, ctx.env.get('SCRIPT_NAME').split('/'))[:4])
    if ctx.path == '/list':
        time_serie_name += '.list'
    time_serie_name = time_serie_name.replace('..', '.').lower()
    record_timer(time_serie_name, duration * 1000)
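
A standalone sketch of how the timer name is assembled, using a sample method and SCRIPT_NAME instead of the live web.py context; note the list() around filter(), which Python 3 needs before slicing:

method = 'GET'
script_name = '/replicas/user.jdoe/file.root/meta'

time_serie_name = '.'.join(('http', 'methods', method, 'resources.'))
time_serie_name += '.'.join(list(filter(None, script_name.split('/')))[:4])
time_serie_name = time_serie_name.replace('..', '.').lower()
print(time_serie_name)   # http.methods.get.resources.replicas.user.jdoe.file.root.meta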
Example 10
def cancel(tid, session):
    """
    Kills a transfer by setting its state to CANCELLED.

    :param tid: The transfer job id.
    """

    record_counter('daemons.mock.fts3.cancel')

    ts = time.time()
    query = session.query(test_models.MockFTSTransfer).filter(test_models.MockFTSTransfer.transfer_id == tid)
    query.update({'state': FTSState.CANCELED,
                  'last_modified': datetime.datetime.utcnow()})
    record_timer('daemons.mock.fts3.cancel.update_state', (time.time()-ts)*1000)
Example 12
    def delegate_proxy(self,
                       proxy,
                       ca_path='/etc/grid-security/certificates/',
                       duration_hours=96,
                       timeleft_hours=72):
        """Delegate user proxy to fts server if the lifetime is less than timeleft_hours

        :param proxy: proxy to be delegated
        :type proxy: str

        :param ca_path: ca path for verification, defaults to '/etc/grid-security/certificates/'
        :type ca_path: str, optional
        :param duration_hours: delegation validity duration in hours, defaults to 96
        :type duration_hours: int, optional
        :param timeleft_hours: minimal delegation time left in hours, defaults to 72
        :type timeleft_hours: int, optional

        :return: delegation ID
        :rtype: str
        """
        logging.info("Delegating proxy %s to %s", proxy, self.external_host)
        start_time = time.time()

        try:
            context = Context(self.external_host,
                              ucert=proxy,
                              ukey=proxy,
                              verify=True,
                              capath=ca_path)
            delegation_id = delegate(
                context,
                lifetime=datetime.timedelta(hours=duration_hours),
                delegate_when_lifetime_lt=datetime.timedelta(
                    hours=timeleft_hours))
            record_timer('transfertool.fts3.delegate_proxy.success.%s' % proxy,
                         (time.time() - start_time))
        except ServerError:
            logging.error("Server side exception during FTS proxy delegation.")
            record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy,
                         (time.time() - start_time))
            raise
        except ClientError:
            logging.error("Config side exception during FTS proxy delegation.")
            record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy,
                         (time.time() - start_time))
            raise
        except BadEndpoint:
            logging.error("Wrong FTS endpoint: %s", self.external_host)
            record_timer('transfertool.fts3.delegate_proxy.fail.%s' % proxy,
                         (time.time() - start_time))
            raise
        except ReadTimeout as error:
            raise TransferToolTimeout(error)
        except JSONDecodeError as error:
            raise TransferToolWrongAnswer(error)

        logging.info("Delegated proxy %s", delegation_id)

        return delegation_id, context
Example 13
def get_requests(rse_id=None, process=0, total_processes=1, thread=0, total_threads=1, mock=False, bulk=100, activity=None):
    ts = time.time()
    reqs = request.get_next(request_type=[RequestType.TRANSFER,
                                          RequestType.STAGEIN,
                                          RequestType.STAGEOUT],
                            state=RequestState.QUEUED,
                            limit=bulk,
                            rse=rse_id,
                            activity=activity,
                            process=process,
                            total_processes=total_processes,
                            thread=thread,
                            total_threads=total_threads)
    record_timer('daemons.conveyor.submitter.get_next', (time.time() - ts) * 1000)
    return reqs
Example 14
def query_request(request_id, transfertool='fts3', session=None):
    """
    Query the status of a request.

    :param request_id: Request-ID as a 32 character hex string.
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Request status information as a dictionary.
    """

    record_counter('core.request.query_request')

    req = get_request(request_id, session=session)

    req_status = {'request_id': request_id,
                  'new_state': None}

    if not req:
        req_status['new_state'] = RequestState.LOST
        return req_status

    if transfertool == 'fts3':
        try:
            ts = time.time()
            response = fts3.query(req['external_id'], req['external_host'])
            record_timer('core.request.query_request_fts3', (time.time() - ts) * 1000)
            req_status['details'] = response
        except Exception:
            raise

        if not response:
            req_status['new_state'] = RequestState.LOST
        else:
            if 'job_state' not in response:
                req_status['new_state'] = RequestState.LOST
            elif response['job_state'] in (str(FTSState.FAILED),
                                           str(FTSState.FINISHEDDIRTY),
                                           str(FTSState.CANCELED)):
                req_status['new_state'] = RequestState.FAILED
            elif response['job_state'] == str(FTSState.FINISHED):
                req_status['new_state'] = RequestState.DONE
    else:
        raise NotImplementedError

    return req_status
Example 15
def poll_transfers(external_host, xfers, process=0, thread=0, timeout=None):
    try:
        try:
            tss = time.time()
            logging.info('%i:%i - polling %i transfers against %s with timeout %s' % (process, thread, len(xfers), external_host, timeout))
            resps = request_core.bulk_query_transfers(external_host, xfers, 'fts3', timeout)
            record_timer('daemons.conveyor.poller.bulk_query_transfers', (time.time() - tss) * 1000 / len(xfers))
        except RequestException as error:
            logging.error("Failed to contact FTS server: %s" % (str(error)))
            return
        except:
            logging.error("Failed to query FTS info: %s" % (traceback.format_exc()))
            return

        logging.debug('%i:%i - updating %s requests status' % (process, thread, len(xfers)))
        for transfer_id in resps:
            try:
                transf_resp = resps[transfer_id]
                # transf_resp is None: Lost.
                #             is Exception: Failed to get fts job status.
                #             is {}: No terminated jobs.
                #             is {request_id: {file_status}}: terminated jobs.
                if transf_resp is None:
                    set_transfer_state(external_host, transfer_id, RequestState.LOST)
                    record_counter('daemons.conveyor.poller.transfer_lost')
                elif isinstance(transf_resp, Exception):
                    logging.warning("Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp))
                    record_counter('daemons.conveyor.poller.query_transfer_exception')
                else:
                    for request_id in transf_resp:
                        ret = update_request_state(transf_resp[request_id])
                        # if True, really update request content; if False, only touch request
                        record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)

                # should touch transfers.
                # Otherwise if one bulk transfer includes many requests and one is not terminated, the transfer will be poll again.
                touch_transfer(external_host, transfer_id)
            except (DatabaseException, DatabaseError) as error:
                if isinstance(error.args[0], tuple) and (match('.*ORA-00054.*', error.args[0][0]) or match('.*ORA-00060.*', error.args[0][0]) or ('ERROR 1205 (HY000)' in error.args[0][0])):
                    logging.warning("Lock detected when handling request %s - skipping" % request_id)
                else:
                    logging.error(traceback.format_exc())
        logging.debug('%i:%i - finished updating %s requests status' % (process, thread, len(xfers)))
    except:
        logging.error(traceback.format_exc())
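
The comment above documents a three-way dispatch on the per-transfer response; a minimal sketch of that classification with toy inputs:

def classify_response(transf_resp):
    # None means the job is lost, an Exception means the status query failed,
    # an empty dict means nothing terminated yet, and a non-empty dict maps
    # request ids to their per-file status.
    if transf_resp is None:
        return 'lost'
    if isinstance(transf_resp, Exception):
        return 'query_failed'
    return 'terminated' if transf_resp else 'not_terminated'

assert classify_response(None) == 'lost'
assert classify_response(Exception('boom')) == 'query_failed'
assert classify_response({}) == 'not_terminated'
assert classify_response({'req-1': {'file_state': 'FINISHED'}}) == 'terminated'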
Example 16
def submit(tinfo, session):
    """
    Create a new transfer job in state QUEUED.

    :param tinfo: The transfer job information as a string.
    :returns: The transfer job id.
    """

    record_counter('daemons.mock.fts3.submit')

    ts = time.time()
    tid = generate_uuid()
    record_timer('daemons.mock.fts3.submit.000-generate_uuid', (time.time()-ts)*1000)

    ts = time.time()
    new_transfer = test_models.MockFTSTransfer(transfer_id=tid, transfer_metadata=str(tinfo))
    new_transfer.save(session=session)
    record_timer('daemons.mock.fts3.submit.001-new_transfer', (time.time()-ts)*1000)

    return {'job_id': tid}
Example 17
def submit_bulk_transfers(external_host, files, transfertool='fts3', job_params={}, timeout=None, user_transfer_job=False):
    """
    Submit transfer request to a transfertool.

    :param external_host:  External host name as string
    :param files:          List of dictionaries describing the request files.
    :param transfertool:   Transfertool as a string.
    :param job_params:     Metadata key/value pairs for all files as a dictionary.
    :returns:              Transfertool external ID.
    """

    record_counter('core.request.submit_transfer')

    transfer_id = None

    if transfertool == 'fts3':
        start_time = time.time()
        job_files = []
        for file in files:
            job_file = {}
            for key in file:
                if key == 'sources':
                    # convert sources from (src_rse, url, src_rse_id, rank) to url
                    job_file[key] = []
                    for source in file[key]:
                        job_file[key].append(source[1])
                else:
                    job_file[key] = file[key]
            job_files.append(job_file)
        if not user_transfer_job:
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        elif USER_TRANSFERS == "cms":
            transfer_id = FTS3MyProxyTransfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        else:
            # if no valid USER TRANSFER cases --> go with std submission
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        record_timer('core.request.submit_transfers_fts3', (time.time() - start_time) * 1000 / len(files))
    return transfer_id
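
A toy illustration of the sources conversion above, where each (src_rse, url, src_rse_id, rank) tuple is reduced to its URL; all values are invented.

file = {
    'sources': [('RSE_A', 'gsiftp://a.example.org/f', 'rse-id-a', 1),
                ('RSE_B', 'gsiftp://b.example.org/f', 'rse-id-b', 2)],
    'destinations': ['gsiftp://c.example.org/f'],
    'filesize': 1048576,
}

# Keep every field as-is except 'sources', which is reduced to the bare URLs.
job_file = {key: ([source[1] for source in value] if key == 'sources' else value)
            for key, value in file.items()}
print(job_file['sources'])   # ['gsiftp://a.example.org/f', 'gsiftp://b.example.org/f']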
Example 18
def bulk_query_transfers(request_host, transfer_ids, transfertool='fts3', timeout=None):
    """
    Query the status of a transfer.

    :param request_host:  Name of the external host.
    :param transfer_ids:  List of (External-ID as a 32 character hex string)
    :param transfertool:  Transfertool name as a string.
    :returns:             Request status information as a dictionary.
    """

    record_counter('core.request.bulk_query_transfers')

    if transfertool == 'fts3':
        try:
            start_time = time.time()
            fts_resps = FTS3Transfertool(external_host=request_host).bulk_query(transfer_ids=transfer_ids, timeout=timeout)
            record_timer('core.request.bulk_query_transfers', (time.time() - start_time) * 1000 / len(transfer_ids))
        except Exception:
            raise

        for transfer_id in transfer_ids:
            if transfer_id not in fts_resps:
                fts_resps[transfer_id] = Exception("Transfer id %s is not returned" % transfer_id)
            if fts_resps[transfer_id] and not isinstance(fts_resps[transfer_id], Exception):
                for request_id in fts_resps[transfer_id]:
                    if fts_resps[transfer_id][request_id]['file_state'] in (str(FTSState.FAILED),
                                                                            str(FTSState.FINISHEDDIRTY),
                                                                            str(FTSState.CANCELED)):
                        fts_resps[transfer_id][request_id]['new_state'] = RequestState.FAILED
                    elif fts_resps[transfer_id][request_id]['file_state'] == str(FTSState.FINISHED):
                        fts_resps[transfer_id][request_id]['new_state'] = RequestState.DONE
        return fts_resps
    else:
        raise NotImplementedError

    return None
Example 19
def run_once(bulk, group_bulk, rse_ids, scheme, failover_scheme, transfertool_kwargs, heartbeat_handler, activity):
    worker_number, total_workers, logger = heartbeat_handler.live()

    start_time = time.time()
    transfers = next_transfers_to_submit(
        total_workers=total_workers,
        worker_number=worker_number,
        failover_schemes=failover_scheme,
        limit=bulk,
        activity=activity,
        rses=rse_ids,
        schemes=scheme,
        transfertools_by_name={'fts3': FTS3Transfertool},
        older_than=None,
        request_type=RequestType.STAGEIN,
        logger=logger,
    )
    total_transfers = len(list(hop for paths in transfers.values() for path in paths for hop in path))
    record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (total_transfers if transfers else 1))
    record_counter('daemons.conveyor.stager.get_stagein_transfers', total_transfers)
    record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', total_transfers)
    logger(logging.INFO, 'Got %s stagein transfers for %s' % (total_transfers, activity))

    for builder, transfer_paths in transfers.items():
        transfertool_obj = builder.make_transfertool(logger=logger, **transfertool_kwargs.get(builder.transfertool_class, {}))
        logger(logging.INFO, 'Starting to group transfers for %s (%s)' % (activity, transfertool_obj))
        start_time = time.time()
        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
        record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfer_paths) or 1))

        logger(logging.INFO, 'Starting to submit transfers for %s (%s)' % (activity, transfertool_obj))
        for job in grouped_jobs:
            worker_number, total_workers, logger = heartbeat_handler.live()
            submit_transfer(transfertool_obj=transfertool_obj, transfers=job['transfers'], job_params=job['job_params'], submitter='transfer_submitter', logger=logger)

    queue_empty = False
    if total_transfers < group_bulk:
        queue_empty = True
        logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s' % (total_transfers, activity, group_bulk))
    return queue_empty
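
A small sketch of the hop counting above: transfers maps each transfertool builder to a list of paths, and each path is a list of hops; toy data only.

transfers = {
    'builder_a': [['hop1'], ['hop1', 'hop2']],
    'builder_b': [['hop1']],
}
total_transfers = len(list(hop for paths in transfers.values() for path in paths for hop in path))
assert total_transfers == 4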
Example 20
def transmogrifier(bulk=5, once=False, sleep_time=60):
    """
    Creates a Transmogrifier Worker that gets a list of new DIDs for a given hash,
    identifies the subscriptions matching the DIDs and
    submit a replication rule for each DID matching a subscription.

    :param bulk: The number of requests to process.
    :param once: Run only once.
    :param sleep_time: Time between two cycles.
    """

    executable = 'transmogrifier'
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)

        dids, subscriptions = [], []
        tottime = 0
        prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                             heart_beat['nr_threads'])

        try:
            #  Get the new DIDs based on the is_new flag
            for did in list_new_dids(thread=heart_beat['assign_thread'],
                                     total_threads=heart_beat['nr_threads'],
                                     chunk_size=bulk,
                                     did_type=None):
                dids.append({
                    'scope': did['scope'],
                    'did_type': str(did['did_type']),
                    'name': did['name']
                })

            sub_dict = {3: []}
            #  Get the list of subscriptions. The default priority of the subscription is 3. 0 is the highest priority, 5 the lowest
            #  The priority is defined as 'policyid'
            for sub in list_subscriptions(None, None):
                if sub['state'] != SubscriptionState.INACTIVE and sub[
                        'lifetime'] and (datetime.now() > sub['lifetime']):
                    update_subscription(
                        name=sub['name'],
                        account=sub['account'],
                        metadata={'state': SubscriptionState.INACTIVE})

                elif sub['state'] in [
                        SubscriptionState.ACTIVE, SubscriptionState.UPDATED
                ]:
                    priority = 3
                    if 'policyid' in sub:
                        if int(sub['policyid']) not in sub_dict:
                            sub_dict[int(sub['policyid'])] = []
                        priority = int(sub['policyid'])
                    sub_dict[priority].append(sub)
            priorities = list(sub_dict.keys())
            priorities.sort()
            #  Order the subscriptions according to their priority
            for priority in priorities:
                subscriptions.extend(sub_dict[priority])
        except SubscriptionNotFound as error:
            logging.warning(prepend_str + 'No subscriptions defined: %s' %
                            (str(error)))
            time.sleep(10)
            continue
        except Exception as error:
            logging.error(
                prepend_str +
                'Failed to get list of new DIDs or subscriptions: %s' %
                (str(error)))

        try:
            results = {}
            start_time = time.time()
            blacklisted_rse_id = [
                rse['id'] for rse in list_rses({'availability_write': False})
            ]
            logging.debug(prepend_str + 'In transmogrifier worker')
            identifiers = []
            #  Loop over all the new dids
            for did in dids:
                did_success = True
                if did['did_type'] == str(
                        DIDType.DATASET) or did['did_type'] == str(
                            DIDType.CONTAINER):
                    did_tag = '%s:%s' % (did['scope'].internal, did['name'])
                    results[did_tag] = []
                    try:
                        metadata = get_metadata(did['scope'], did['name'])
                        # Loop over all the subscriptions
                        for subscription in subscriptions:
                            #  Check if the DID match the subscription
                            if is_matching_subscription(
                                    subscription, did, metadata) is True:
                                filter_string = loads(subscription['filter'])
                                split_rule = filter_string.get(
                                    'split_rule', False)
                                stime = time.time()
                                results[did_tag].append(subscription['id'])
                                logging.info(prepend_str +
                                             '%s:%s matches subscription %s' %
                                             (did['scope'], did['name'],
                                              subscription['name']))
                                rules = loads(
                                    subscription['replication_rules'])
                                created_rules = {}
                                cnt = 0
                                for rule_dict in rules:
                                    cnt += 1
                                    created_rules[cnt] = []
                                    # Get all the rule and subscription parameters
                                    grouping = rule_dict.get(
                                        'grouping', 'DATASET')
                                    lifetime = rule_dict.get('lifetime', None)
                                    ignore_availability = rule_dict.get(
                                        'ignore_availability', None)
                                    weight = rule_dict.get('weight', None)
                                    source_replica_expression = rule_dict.get(
                                        'source_replica_expression', None)
                                    locked = rule_dict.get('locked', None)
                                    if locked == 'True':
                                        locked = True
                                    else:
                                        locked = False
                                    purge_replicas = rule_dict.get(
                                        'purge_replicas', False)
                                    if purge_replicas == 'True':
                                        purge_replicas = True
                                    else:
                                        purge_replicas = False
                                    rse_expression = str(
                                        rule_dict['rse_expression'])
                                    comment = str(subscription['comments'])
                                    subscription_id = str(subscription['id'])
                                    account = subscription['account']
                                    copies = int(rule_dict['copies'])
                                    activity = rule_dict.get(
                                        'activity', 'User Subscriptions')
                                    try:
                                        validate_schema(name='activity',
                                                        obj=activity)
                                    except InputValidationError as error:
                                        logging.error(
                                            prepend_str +
                                            'Error validating the activity %s'
                                            % (str(error)))
                                        activity = 'User Subscriptions'
                                    if lifetime:
                                        lifetime = int(lifetime)

                                    str_activity = "".join(activity.split())
                                    success = False
                                    nattempt = 5
                                    attemptnr = 0
                                    skip_rule_creation = False

                                    selected_rses = []
                                    chained_idx = rule_dict.get(
                                        'chained_idx', None)
                                    if chained_idx:
                                        params = {}
                                        if rule_dict.get(
                                                'associated_site_idx', None):
                                            params[
                                                'associated_site_idx'] = rule_dict.get(
                                                    'associated_site_idx',
                                                    None)
                                        logging.debug(
                                            '%s Chained subscription identified. Will use %s',
                                            prepend_str,
                                            str(created_rules[chained_idx]))
                                        algorithm = rule_dict.get(
                                            'algorithm', None)
                                        selected_rses = select_algorithm(
                                            algorithm,
                                            created_rules[chained_idx], params)
                                    else:
                                        # In the case of chained subscription, don't use rseselector but use the rses returned by the algorithm
                                        if split_rule:
                                            vo = account.vo
                                            rses = parse_expression(
                                                rse_expression,
                                                filter={'vo': vo})
                                            list_of_rses = [
                                                rse['id'] for rse in rses
                                            ]
                                            # Check that some rule doesn't already exist for this DID and subscription
                                            preferred_rse_ids = []
                                            for rule in list_rules(
                                                    filters={
                                                        'subscription_id':
                                                        subscription_id,
                                                        'scope': did['scope'],
                                                        'name': did['name']
                                                    }):
                                                already_existing_rses = [
                                                    (rse['rse'], rse['id']) for
                                                    rse in parse_expression(
                                                        rule['rse_expression'],
                                                        filter={'vo': vo})
                                                ]
                                                for rse, rse_id in already_existing_rses:
                                                    if (rse_id in list_of_rses
                                                        ) and (
                                                            rse_id not in
                                                            preferred_rse_ids):
                                                        preferred_rse_ids.append(
                                                            rse_id)
                                            if len(preferred_rse_ids
                                                   ) >= copies:
                                                skip_rule_creation = True
                                            rse_id_dict = {}
                                            for rse in rses:
                                                rse_id_dict[
                                                    rse['id']] = rse['rse']
                                            try:
                                                rseselector = RSESelector(
                                                    account=account,
                                                    rses=rses,
                                                    weight=weight,
                                                    copies=copies -
                                                    len(preferred_rse_ids))
                                                selected_rses = [
                                                    rse_id_dict[rse_id]
                                                    for rse_id, _, _ in
                                                    rseselector.select_rse(
                                                        0,
                                                        preferred_rse_ids=
                                                        preferred_rse_ids,
                                                        copies=copies,
                                                        blacklist=
                                                        blacklisted_rse_id)
                                                ]
                                            except (InsufficientTargetRSEs,
                                                    InsufficientAccountLimit,
                                                    InvalidRuleWeight,
                                                    RSEOverQuota) as error:
                                                logging.warning(
                                                    prepend_str +
                                                    'Problem getting RSEs for subscription "%s" for account %s : %s. Try including blacklisted sites'
                                                    % (subscription['name'],
                                                       account, str(error)))
                                                # Now including the blacklisted sites
                                                try:
                                                    rseselector = RSESelector(
                                                        account=account,
                                                        rses=rses,
                                                        weight=weight,
                                                        copies=copies -
                                                        len(preferred_rse_ids))
                                                    selected_rses = [
                                                        rse_id_dict[rse_id]
                                                        for rse_id, _, _ in
                                                        rseselector.select_rse(
                                                            0,
                                                            preferred_rse_ids=
                                                            preferred_rse_ids,
                                                            copies=copies,
                                                            blacklist=[])
                                                    ]
                                                    ignore_availability = True
                                                except (InsufficientTargetRSEs,
                                                        InsufficientAccountLimit,
                                                        InvalidRuleWeight,
                                                        RSEOverQuota) as error:
                                                    logging.error(
                                                        prepend_str +
                                                        'Problem getting RSEs for subscription "%s" for account %s : %s. Skipping rule creation.'
                                                        %
                                                        (subscription['name'],
                                                         account, str(error)))
                                                    monitor.record_counter(
                                                        counters=
                                                        'transmogrifier.addnewrule.errortype.%s'
                                                        % (str(error.__class__.
                                                               __name__)),
                                                        delta=1)
                                                    # The DID won't be reevaluated at the next cycle
                                                    did_success = did_success and True
                                                    continue

                                    for attempt in range(0, nattempt):
                                        attemptnr = attempt
                                        nb_rule = 0
                                        #  Try to create the rule
                                        try:
                                            if split_rule:
                                                if not skip_rule_creation:
                                                    for rse in selected_rses:
                                                        if isinstance(
                                                                selected_rses,
                                                                dict):
                                                            source_replica_expression = selected_rses[
                                                                rse].get(
                                                                    'source_replica_expression',
                                                                    None)
                                                            weight = selected_rses[
                                                                rse].get(
                                                                    'weight',
                                                                    None)
                                                        logging.info(
                                                            prepend_str +
                                                            'Will insert one rule for %s:%s on %s'
                                                            %
                                                            (did['scope'],
                                                             did['name'], rse))
                                                        rule_ids = add_rule(
                                                            dids=[{
                                                                'scope':
                                                                did['scope'],
                                                                'name':
                                                                did['name']
                                                            }],
                                                            account=account,
                                                            copies=1,
                                                            rse_expression=rse,
                                                            grouping=grouping,
                                                            weight=weight,
                                                            lifetime=lifetime,
                                                            locked=locked,
                                                            subscription_id=
                                                            subscription_id,
                                                            source_replica_expression
                                                            =source_replica_expression,
                                                            activity=activity,
                                                            purge_replicas=
                                                            purge_replicas,
                                                            ignore_availability=
                                                            ignore_availability,
                                                            comment=comment)
                                                        created_rules[
                                                            cnt].append(
                                                                rule_ids[0])
                                                        nb_rule += 1
                                                        if nb_rule == copies:
                                                            success = True
                                                            break
                                            else:
                                                rule_ids = add_rule(
                                                    dids=[{
                                                        'scope': did['scope'],
                                                        'name': did['name']
                                                    }],
                                                    account=account,
                                                    copies=copies,
                                                    rse_expression=
                                                    rse_expression,
                                                    grouping=grouping,
                                                    weight=weight,
                                                    lifetime=lifetime,
                                                    locked=locked,
                                                    subscription_id=
                                                    subscription['id'],
                                                    source_replica_expression=
                                                    source_replica_expression,
                                                    activity=activity,
                                                    purge_replicas=
                                                    purge_replicas,
                                                    ignore_availability=
                                                    ignore_availability,
                                                    comment=comment)
                                                created_rules[cnt].append(
                                                    rule_ids[0])
                                                nb_rule += 1
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.done',
                                                delta=nb_rule)
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.activity.%s'
                                                % str_activity,
                                                delta=nb_rule)
                                            success = True
                                            break
                                        except (InvalidReplicationRule,
                                                InvalidRuleWeight,
                                                InvalidRSEExpression,
                                                StagingAreaRuleRequiresLifetime,
                                                DuplicateRule) as error:
                                            # Errors that won't be retried
                                            success = True
                                            logging.error(prepend_str + '%s' %
                                                          (str(error)))
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.%s'
                                                % (str(
                                                    error.__class__.__name__)),
                                                delta=1)
                                            break
                                        except (ReplicationRuleCreationTemporaryFailed,
                                                InsufficientTargetRSEs,
                                                InsufficientAccountLimit,
                                                DatabaseException,
                                                RSEBlacklisted,
                                                RSEWriteBlocked) as error:
                                            # Errors to be retried
                                            logging.error(
                                                prepend_str +
                                                '%s Will perform an other attempt %i/%i'
                                                % (str(error), attempt + 1,
                                                   nattempt))
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.%s'
                                                % (str(
                                                    error.__class__.__name__)),
                                                delta=1)
                                        except Exception:
                                            # Unexpected errors
                                            monitor.record_counter(
                                                counters=
                                                'transmogrifier.addnewrule.errortype.unknown',
                                                delta=1)
                                            exc_type, exc_value, exc_traceback = exc_info(
                                            )
                                            logging.critical(
                                                prepend_str + ''.join(
                                                    format_exception(
                                                        exc_type, exc_value,
                                                        exc_traceback)).strip(
                                                        ))

                                    did_success = (did_success and success)
                                    if (attemptnr +
                                            1) == nattempt and not success:
                                        logging.error(
                                            prepend_str +
                                            'Rule for %s:%s on %s cannot be inserted'
                                            % (did['scope'], did['name'],
                                               rse_expression))
                                    else:
                                        logging.info(
                                            prepend_str +
                                            '%s rule(s) inserted in %f seconds'
                                            % (str(nb_rule),
                                               time.time() - stime))
                    except DataIdentifierNotFound as error:
                        logging.warning(prepend_str + str(error))

                if did_success:
                    if did['did_type'] == str(DIDType.FILE):
                        monitor.record_counter(
                            counters='transmogrifier.did.file.processed',
                            delta=1)
                    elif did['did_type'] == str(DIDType.DATASET):
                        monitor.record_counter(
                            counters='transmogrifier.did.dataset.processed',
                            delta=1)
                    elif did['did_type'] == str(DIDType.CONTAINER):
                        monitor.record_counter(
                            counters='transmogrifier.did.container.processed',
                            delta=1)
                    monitor.record_counter(
                        counters='transmogrifier.did.processed', delta=1)
                    identifiers.append({
                        'scope': did['scope'],
                        'name': did['name'],
                        'did_type': DIDType.from_sym(did['did_type'])})

            time1 = time.time()

            #  Mark the DIDs as processed
            for identifier in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier, None)

            logging.info(prepend_str + 'Time to set the new flag : %f' %
                         (time.time() - time1))
            tottime = time.time() - start_time
            for sub in subscriptions:
                update_subscription(
                    name=sub['name'],
                    account=sub['account'],
                    metadata={'last_processed': datetime.now()})
            logging.info(prepend_str +
                         'It took %f seconds to process %i DIDs' %
                         (tottime, len(dids)))
            logging.debug(prepend_str + 'DIDs processed : %s' % (str(dids)))
            monitor.record_counter(counters='transmogrifier.job.done', delta=1)
            monitor.record_timer(stat='transmogrifier.job.duration',
                                 time=1000 * tottime)
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical(prepend_str + ''.join(
                format_exception(exc_type, exc_value, exc_traceback)).strip())
            monitor.record_counter(counters='transmogrifier.job.error',
                                   delta=1)
            monitor.record_counter(counters='transmogrifier.addnewrule.error',
                                   delta=1)
        if once is True:
            break
        if tottime < sleep_time:
            logging.info(prepend_str + 'Will sleep for %s seconds' %
                         (sleep_time - tottime))
            time.sleep(sleep_time - tottime)
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop requested')
    logging.info(prepend_str + 'Graceful stop done')
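
The fragment above retries add_rule up to nattempt times, treating rule-definition errors as permanent and infrastructure errors as retryable. The following is a minimal, self-contained sketch of that retry pattern; insert_rule_with_retries and the use of ValueError/OSError as stand-ins for the Rucio exception classes are illustrative assumptions, not part of the original code.

# Minimal sketch of the retry loop above: permanent errors end the loop at once,
# temporary errors trigger another attempt. ValueError/OSError stand in for the
# Rucio exceptions (InvalidReplicationRule, DatabaseException, ...).
def insert_rule_with_retries(add_rule_once, nattempt=5):
    success = False
    for _ in range(nattempt):
        try:
            add_rule_once()   # would call rucio.core.rule.add_rule(...) in the daemon
            success = True
            break
        except ValueError:    # permanent error: stop, the rule can never be created as requested
            success = True
            break
        except OSError:       # temporary error: retry on the next iteration
            continue
    return success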
Ejemplo n.º 21
0
def submit_bulk_transfers(external_host, files, job_params, timeout=None):
    """
    Submit a transfer to FTS3 via JSON.

    :param external_host: FTS server as a string.
    :param files: List of file dictionaries, one per transfer.
    :param job_params: Dictionary containing key/value pairs, for all transfers.
    :param timeout: Timeout in seconds for the submission request.
    :returns: FTS transfer identifier.
    """

    # FTS3 expects 'davs' as the scheme identifier instead of https
    for file in files:
        if not file['sources'] or file['sources'] == []:
            raise Exception('No sources defined')

        new_src_urls = []
        new_dst_urls = []
        for url in file['sources']:
            if url.startswith('https'):
                new_src_urls.append(':'.join(['davs'] + url.split(':')[1:]))
            else:
                new_src_urls.append(url)
        for url in file['destinations']:
            if url.startswith('https'):
                new_dst_urls.append(':'.join(['davs'] + url.split(':')[1:]))
            else:
                new_dst_urls.append(url)

        file['sources'] = new_src_urls
        file['destinations'] = new_dst_urls

    transfer_id = None
    expected_transfer_id = None
    if __USE_DETERMINISTIC_ID:
        job_params = job_params.copy()
        job_params["id_generator"] = "deterministic"
        job_params["sid"] = files[0]['metadata']['request_id']
        expected_transfer_id = get_deterministic_id(external_host, job_params["sid"])
        logging.debug("Submit bulk transfers in deterministic mode, sid %s, expected transfer id: %s" % (job_params["sid"], expected_transfer_id))

    # bulk submission
    params_dict = {'files': files, 'params': job_params}
    params_str = json.dumps(params_dict)

    r = None
    if external_host.startswith('https://'):
        try:
            ts = time.time()
            r = requests.post('%s/jobs' % external_host,
                              verify=False,
                              cert=(__USERCERT, __USERCERT),
                              data=params_str,
                              headers={'Content-Type': 'application/json'},
                              timeout=timeout)
            record_timer('transfertool.fts3.submit_transfer.%s' % __extract_host(external_host), (time.time() - ts) * 1000 / len(files))
        except Exception:
            logging.warn('Could not submit transfer to %s - %s' % (external_host, str(traceback.format_exc())))
    else:
        try:
            ts = time.time()
            r = requests.post('%s/jobs' % external_host,
                              data=params_str,
                              headers={'Content-Type': 'application/json'},
                              timeout=timeout)
            record_timer('transfertool.fts3.submit_transfer.%s' % __extract_host(external_host), (time.time() - ts) * 1000 / len(files))
        except Exception:
            logging.warn('Could not submit transfer to %s - %s' % (external_host, str(traceback.format_exc())))

    if r and r.status_code == 200:
        record_counter('transfertool.fts3.%s.submission.success' % __extract_host(external_host), len(files))
        transfer_id = str(r.json()['job_id'])
    else:
        if expected_transfer_id:
            transfer_id = expected_transfer_id
            logging.warn("Failed to submit transfer to %s, will use expected transfer id %s, error: %s" % (external_host, transfer_id, r.text if r is not None else r))
        else:
            logging.warn("Failed to submit transfer to %s, error: %s" % (external_host, r.text if r is not None else r))
        record_counter('transfertool.fts3.%s.submission.failure' % __extract_host(external_host), len(files))

    return transfer_id
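
For reference, a hedged example of how submit_bulk_transfers might be called, based only on the fields the function reads ('sources', 'destinations' and 'metadata' with a 'request_id'). The host, URLs and request id below are placeholders, not real endpoints.

# Illustrative call; all endpoints and identifiers are placeholders.
files = [{'sources': ['https://source.example.org/rucio/file1'],        # rewritten to davs://
          'destinations': ['https://dest.example.org/rucio/file1'],     # rewritten to davs://
          'metadata': {'request_id': '00000000000000000000000000000000'}}]
job_params = {'verify_checksum': True, 'job_metadata': {'issuer': 'rucio'}}
transfer_id = submit_bulk_transfers('https://fts.example.org:8446', files, job_params, timeout=120)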
Ejemplo n.º 22
0
def submit_transfers(transfers, job_metadata):
    """
    Submit a transfer to FTS3 via JSON.

    :param transfers: List of transfer dictionaries, each containing 'request_id', 'sources', 'dest_urls', 'filesize', 'md5', 'adler32', 'overwrite', 'activity', 'src_spacetoken', 'dest_spacetoken', 'copy_pin_lifetime', 'bring_online' and 'external_host'.
    :param job_metadata: Dictionary containing key/value pairs, for all transfers.
    :returns: Dictionary mapping each request ID to its FTS transfer identifier, destination URLs and external host.
    """

    # Early sanity check
    for transfer in transfers:
        if not transfer['sources'] or transfer['sources'] == []:
            raise Exception('No sources defined')
        transfer['src_urls'] = []
        for source in transfer['sources']:
            # convert sources from (src_rse, url, src_rse_id, rank) to url
            transfer['src_urls'].append(source[1])

    # FTS3 expects 'davs' as the scheme identifier instead of https
    for transfer in transfers:
        new_src_urls = []
        new_dst_urls = []
        for url in transfer['src_urls']:
            if url.startswith('https'):
                new_src_urls.append(':'.join(['davs'] + url.split(':')[1:]))
            else:
                new_src_urls.append(url)
        for url in transfer['dest_urls']:
            if url.startswith('https'):
                new_dst_urls.append(':'.join(['davs'] + url.split(':')[1:]))
            else:
                new_dst_urls.append(url)
        transfer['src_urls'] = new_src_urls
        transfer['dest_urls'] = new_dst_urls

    # Rewrite the checksums into FTS3 format, prefer adler32 if available
    for transfer in transfers:
        transfer['checksum'] = None
        if transfer.get('md5'):
            transfer['checksum'] = 'MD5:%s' % str(transfer['md5'])
        if transfer.get('adler32'):
            transfer['checksum'] = 'ADLER32:%s' % str(transfer['adler32'])

    transfer_ids = {}

    job_metadata['issuer'] = 'rucio'
    job_metadata['previous_attempt_id'] = None

    # we have to loop until we get proper fts3 bulk submission
    for transfer in transfers:

        job_metadata['request_id'] = transfer['request_id']

        if 'previous_attempt_id' in transfer.keys():
            job_metadata['previous_attempt_id'] = transfer['previous_attempt_id']

        params_dict = {'files': [{'sources': transfer['src_urls'],
                                  'destinations': transfer['dest_urls'],
                                  'metadata': {'issuer': 'rucio'},
                                  'filesize': int(transfer['filesize']),
                                  'checksum': str(transfer['checksum']),
                                  'activity': str(transfer['activity']),
                                  'selection_strategy': transfer.get('selection_strategy', 'auto')}],
                       'params': {'verify_checksum': True if transfer['checksum'] else False,
                                  'spacetoken': transfer['dest_spacetoken'] if transfer['dest_spacetoken'] else 'null',
                                  'copy_pin_lifetime': transfer['copy_pin_lifetime'] if transfer['copy_pin_lifetime'] else -1,
                                  'bring_online': transfer['bring_online'] if transfer['bring_online'] else None,
                                  'job_metadata': job_metadata,
                                  'source_spacetoken': transfer['src_spacetoken'] if transfer['src_spacetoken'] else None,
                                  'overwrite': transfer['overwrite'],
                                  'priority': 3}}

        r = None
        params_str = json.dumps(params_dict)

        transfer_host = transfer['external_host']
        if transfer_host.startswith('https://'):
            try:
                ts = time.time()
                r = requests.post('%s/jobs' % transfer_host,
                                  verify=False,
                                  cert=(__USERCERT, __USERCERT),
                                  data=params_str,
                                  headers={'Content-Type': 'application/json'},
                                  timeout=5)
                record_timer('transfertool.fts3.submit_transfer.%s' % __extract_host(transfer_host), (time.time() - ts) * 1000)
            except Exception:
                logging.warn('Could not submit transfer to %s' % transfer_host)
        else:
            try:
                ts = time.time()
                r = requests.post('%s/jobs' % transfer_host,
                                  data=params_str,
                                  headers={'Content-Type': 'application/json'},
                                  timeout=5)
                record_timer('transfertool.fts3.submit_transfer.%s' % __extract_host(transfer_host), (time.time() - ts) * 1000)
            except Exception:
                logging.warn('Could not submit transfer to %s' % transfer_host)

        if r and r.status_code == 200:
            record_counter('transfertool.fts3.%s.submission.success' % __extract_host(transfer_host))
            transfer_ids[transfer['request_id']] = {'external_id': str(r.json()['job_id']),
                                                    'dest_urls': transfer['dest_urls'],
                                                    'external_host': transfer_host}
        else:
            logging.warn("Failed to submit transfer to %s, error: %s" % (transfer_host, r.text if r is not None else r))
            record_counter('transfertool.fts3.%s.submission.failure' % __extract_host(transfer_host))

    return transfer_ids
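
Both submission helpers rewrite https URLs to the 'davs' scheme before posting to FTS3. A standalone version of that rewrite, shown here only for illustration (to_davs is not a name used in the original code):

def to_davs(url):
    # FTS3 expects 'davs' as the scheme identifier instead of https
    if url.startswith('https'):
        return ':'.join(['davs'] + url.split(':')[1:])
    return url

assert to_davs('https://host.example.org:443/path/file') == 'davs://host.example.org:443/path/file'
assert to_davs('gsiftp://host.example.org/path/file') == 'gsiftp://host.example.org/path/file'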
Ejemplo n.º 23
0
    def __update_atime(self):
        """
        Bulk update atime.
        """
        replicas = []
        rses = []
        for report in self.__reports:
            if 'vo' not in report:
                report['vo'] = 'def'

            try:
                # Identify suspicious files
                try:
                    if self.__bad_files_patterns and report['eventType'] in ['get_sm', 'get_sm_a', 'get'] and 'clientState' in report and report['clientState'] not in ['DONE', 'FOUND_ROOT', 'ALREADY_DONE']:
                        for pattern in self.__bad_files_patterns:
                            if 'stateReason' in report and report['stateReason'] and isinstance(report['stateReason'], str) and pattern.match(report['stateReason']):
                                reason = report['stateReason'][:255]
                                if 'url' not in report or not report['url']:
                                    self.__logger(logging.ERROR, 'Missing url in the following trace : ' + str(report))
                                else:
                                    try:
                                        surl = report['url']
                                        declare_bad_file_replicas([surl, ], reason=reason, issuer=InternalAccount('root', vo=report['vo']), status=BadFilesStatus.SUSPICIOUS)
                                        self.__logger(logging.INFO, 'Declare suspicious file %s with reason %s' % (report['url'], reason))
                                    except Exception as error:
                                        self.__logger(logging.ERROR, 'Failed to declare suspicious file' + str(error))
                except Exception as error:
                    self.__logger(logging.ERROR, 'Problem with bad trace : %s . Error %s' % (str(report), str(error)))

                # check if scope in report. if not skip this one.
                if 'scope' not in report:
                    record_counter('daemons.tracer.kronos.missing_scope')
                    if report['eventType'] != 'touch':
                        continue
                else:
                    record_counter('daemons.tracer.kronos.with_scope')
                    report['scope'] = InternalScope(report['scope'],
                                                    report['vo'])

                # handle all events starting with get* and download and touch events.
                if not report['eventType'].startswith('get') and not report['eventType'].startswith('sm_get') and not report['eventType'] == 'download' and not report['eventType'] == 'touch':
                    continue
                if report['eventType'].endswith('_es'):
                    continue
                record_counter('daemons.tracer.kronos.total_get')
                if report['eventType'] == 'get':
                    record_counter('daemons.tracer.kronos.dq2clients')
                elif report['eventType'] == 'get_sm' or report['eventType'] == 'sm_get':
                    if report['eventVersion'] == 'aCT':
                        record_counter(
                            'daemons.tracer.kronos.panda_production_act')
                    else:
                        record_counter(
                            'daemons.tracer.kronos.panda_production')
                elif report['eventType'] == 'get_sm_a' or report['eventType'] == 'sm_get_a':
                    if report['eventVersion'] == 'aCT':
                        record_counter(
                            'daemons.tracer.kronos.panda_analysis_act')
                    else:
                        record_counter('daemons.tracer.kronos.panda_analysis')
                elif report['eventType'] == 'download':
                    record_counter('daemons.tracer.kronos.rucio_download')
                elif report['eventType'] == 'touch':
                    record_counter('daemons.tracer.kronos.rucio_touch')
                else:
                    record_counter('daemons.tracer.kronos.other_get')

                if report['eventType'] == 'download' or report['eventType'] == 'touch':
                    report['usrdn'] = report['account']

                if report['usrdn'] in self.__excluded_usrdns:
                    continue
                # handle touch and non-touch traces differently
                if report['eventType'] != 'touch':
                    # check if the report has the right state.
                    if 'eventVersion' in report:
                        if report['eventVersion'] != 'aCT':
                            if report['clientState'] in self.__excluded_states:
                                continue

                    if 'remoteSite' not in report:
                        continue
                    if not report['remoteSite']:
                        continue

                    if 'filename' not in report:
                        if 'name' in report:
                            report['filename'] = report['name']

                    rses = report['remoteSite'].strip().split(',')
                    for rse in rses:
                        try:
                            rse_id = get_rse_id(rse=rse, vo=report['vo'])
                        except RSENotFound:
                            self.__logger(
                                logging.WARNING,
                                "Cannot lookup rse_id for %s. Will skip this report.",
                                rse)
                            record_counter(
                                'daemons.tracer.kronos.rse_not_found')
                            continue
                        replicas.append({
                            'name': report['filename'],
                            'scope': report['scope'],
                            'rse': rse,
                            'rse_id': rse_id,
                            'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix']),
                            'traceTimeentryUnix': report['traceTimeentryUnix'],
                            'eventVersion': report['eventVersion']})
                else:
                    # if touch event and if datasetScope is in the report then it means
                    # that there is no file scope/name and therefore only the dataset is
                    # put in the queue to be updated and the rest is skipped.
                    rse_id = None
                    rse = None
                    if 'remoteSite' in report:
                        rse = report['remoteSite']
                        try:
                            rse_id = get_rse_id(rse=rse, vo=report['vo'])
                        except RSENotFound:
                            self.__logger(logging.WARNING,
                                          "Cannot lookup rse_id for %s.", rse)
                            record_counter(
                                'daemons.tracer.kronos.rse_not_found')
                    if 'datasetScope' in report:
                        self.__dataset_queue.put({
                            'scope': InternalScope(report['datasetScope'], vo=report['vo']),
                            'name': report['dataset'],
                            'rse_id': rse_id,
                            'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix'])})
                        continue
                    else:
                        if 'remoteSite' not in report:
                            continue
                        replicas.append({
                            'name': report['filename'],
                            'scope': report['scope'],
                            'rse': rse,
                            'rse_id': rse_id,
                            'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix'])})

            except (KeyError, AttributeError):
                self.__logger(logging.ERROR,
                              "Cannot handle report.",
                              exc_info=True)
                record_counter('daemons.tracer.kronos.report_error')
                continue
            except Exception:
                self.__logger(logging.ERROR, "Exception", exc_info=True)
                continue

            for did in list_parent_dids(report['scope'], report['filename']):
                if did['type'] != DIDType.DATASET:
                    continue
                # do not update _dis datasets
                if did['scope'].external == 'panda' and '_dis' in did['name']:
                    continue
                for rse in rses:
                    try:
                        rse_id = get_rse_id(rse=rse, vo=report['vo'])
                    except RSENotFound:
                        self.__logger(
                            logging.WARNING,
                            "Cannot lookup rse_id for %s. Will skip this report.",
                            rse)
                        record_counter('daemons.tracer.kronos.rse_not_found')
                        continue
                    self.__dataset_queue.put({
                        'scope': did['scope'],
                        'name': did['name'],
                        'did_type': did['type'],
                        'rse_id': rse_id,
                        'accessed_at': datetime.utcfromtimestamp(report['traceTimeentryUnix'])})

        if not len(replicas):
            return

        self.__logger(logging.DEBUG, "trying to update replicas: %s", replicas)

        try:
            start_time = time()
            for replica in replicas:
                # if touch replica hits a locked row put the trace back into queue for later retry
                if not touch_replica(replica):
                    resubmit = {
                        'filename': replica['name'],
                        'scope': replica['scope'].external,
                        'remoteSite': replica['rse'],
                        'traceTimeentryUnix': replica['traceTimeentryUnix'],
                        'eventType': 'get',
                        'usrdn': 'someuser',
                        'clientState': 'DONE',
                        'eventVersion': replica['eventVersion']
                    }
                    if replica['scope'].vo != 'def':
                        resubmit['vo'] = replica['scope'].vo
                    self.__conn.send(body=jdumps(resubmit),
                                     destination=self.__queue,
                                     headers={
                                         'appversion': 'rucio',
                                         'resubmitted': '1'
                                     })
                    record_counter('daemons.tracer.kronos.sent_resubmitted')
                    self.__logger(logging.WARNING,
                                  'hit locked row, resubmitted to queue')
            record_timer('daemons.tracer.kronos.update_atime',
                         (time() - start_time) * 1000)
        except Exception:
            self.__logger(logging.ERROR,
                          "Cannot update replicas.",
                          exc_info=True)
            record_counter('daemons.tracer.kronos.update_error')

        self.__logger(logging.INFO, 'updated %d replica(s)' % len(replicas))
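
As an aid to reading __update_atime, here is a hedged example of the smallest 'get' trace that the keys read above would turn into a replica access-time update; every value is a placeholder.

example_report = {
    'eventType': 'get',                 # handled because it starts with 'get'
    'eventVersion': '1.0',
    'clientState': 'DONE',
    'account': 'jdoe',
    'usrdn': 'jdoe',
    'scope': 'user.jdoe',               # converted to an InternalScope above
    'filename': 'file.root',
    'remoteSite': 'SITE_DATADISK',      # resolved to an rse_id above
    'traceTimeentryUnix': 1600000000,   # becomes the accessed_at timestamp
    'vo': 'def',
}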
Ejemplo n.º 24
0
                logging.error("Invalid RSE exception %s for request %s: %s" % (source_replica_expression,
                                                                               req['request_id'],
                                                                               e))
                allowed_source_rses = []
            else:
                allowed_source_rses = [x['rse'] for x in parsed_rses]

    tmpsrc = []
    metadata = {}
    try:
        ts = time.time()
        replications = replica.list_replicas(dids=[{'scope': req['scope'],
                                                    'name': req['name'],
                                                    'type': DIDType.FILE}],
                                             schemes=[scheme, 'gsiftp'])
        record_timer('daemons.conveyor.submitter.list_replicas', (time.time() - ts) * 1000)

        # return gracefully if there are no replicas for a DID
        if not replications:
            return None, None

        for source in replications:

            metadata['filesize'] = long(source['bytes'])
            metadata['md5'] = source['md5']
            metadata['adler32'] = source['adler32']
            # TODO: Source protection

            # we need to know upfront if we are mixed DISK/TAPE source
            mixed_source = []
            for source_rse in source['rses']:
Ejemplo n.º 25
0
def delete_from_storage(replicas, prot, rse_info, staging_areas, prepend_str):
    """
    Delete a list of replicas from storage with an already-created protocol object
    and return the list of {scope, name} entries that were processed.
    """
    deleted_files = []
    rse_name = rse_info['rse']
    rse_id = rse_info['id']
    try:
        prot.connect()
        for replica in replicas:
            # Physical deletion
            try:
                deletion_dict = {
                    'scope': replica['scope'].external,
                    'name': replica['name'],
                    'rse': rse_name,
                    'file-size': replica['bytes'],
                    'bytes': replica['bytes'],
                    'url': replica['pfn'],
                    'protocol': prot.attributes['scheme']
                }
                logging.info('%s Deletion ATTEMPT of %s:%s as %s on %s',
                             prepend_str, replica['scope'], replica['name'],
                             replica['pfn'], rse_name)
                start = time.time()
                # For STAGING RSEs, no physical deletion
                if rse_id in staging_areas:
                    logging.warning(
                        '%s Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                        prepend_str, replica['scope'], replica['name'],
                        replica['pfn'], rse_name)
                    deleted_files.append({
                        'scope': replica['scope'],
                        'name': replica['name']
                    })
                    continue

                if replica['pfn']:
                    pfn = replica['pfn']
                    # sign the URL if necessary
                    if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                        pfn = get_signed_url(rse_info['sign_url'], 'delete', pfn)
                    prot.delete(pfn)
                else:
                    logging.warning(
                        '%s Deletion UNAVAILABLE of %s:%s as %s on %s',
                        prepend_str, replica['scope'], replica['name'],
                        replica['pfn'], rse_name)

                monitor.record_timer(
                    'daemons.reaper.delete.%s.%s' %
                    (prot.attributes['scheme'], rse_name),
                    (time.time() - start) * 1000)
                duration = time.time() - start

                deleted_files.append({
                    'scope': replica['scope'],
                    'name': replica['name']
                })

                deletion_dict['duration'] = duration
                add_message('deletion-done', deletion_dict)
                logging.info(
                    '%s Deletion SUCCESS of %s:%s as %s on %s in %s seconds',
                    prepend_str, replica['scope'], replica['name'],
                    replica['pfn'], rse_name, duration)

            except SourceNotFound:
                err_msg = '%s Deletion NOTFOUND of %s:%s as %s on %s' % (
                    prepend_str, replica['scope'], replica['name'],
                    replica['pfn'], rse_name)
                logging.warning(err_msg)
                deleted_files.append({
                    'scope': replica['scope'],
                    'name': replica['name']
                })
                if replica['state'] == ReplicaState.AVAILABLE:
                    deletion_dict['reason'] = str(err_msg)
                    add_message('deletion-failed', deletion_dict)

            except (ServiceUnavailable, RSEAccessDenied,
                    ResourceTemporaryUnavailable) as error:
                logging.warning(
                    '%s Deletion NOACCESS of %s:%s as %s on %s: %s',
                    prepend_str, replica['scope'], replica['name'],
                    replica['pfn'], rse_name, str(error))
                deletion_dict['reason'] = str(error)
                add_message('deletion-failed', deletion_dict)

            except Exception as error:
                logging.critical(
                    '%s Deletion CRITICAL of %s:%s as %s on %s: %s',
                    prepend_str, replica['scope'], replica['name'],
                    replica['pfn'], rse_name, str(traceback.format_exc()))
                deletion_dict['reason'] = str(error)
                add_message('deletion-failed', deletion_dict)

    except (ServiceUnavailable, RSEAccessDenied,
            ResourceTemporaryUnavailable) as error:
        for replica in replicas:
            logging.warning('%s Deletion NOACCESS of %s:%s as %s on %s: %s',
                            prepend_str, replica['scope'], replica['name'],
                            replica['pfn'], rse_name, str(error))
            add_message(
                'deletion-failed', {
                    'scope': replica['scope'].external,
                    'name': replica['name'],
                    'rse': rse_name,
                    'file-size': replica['bytes'],
                    'bytes': replica['bytes'],
                    'url': replica['pfn'],
                    'reason': str(error),
                    'protocol': prot.attributes['scheme']
                })

    finally:
        prot.close()
    return deleted_files
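
For orientation, a hedged sketch of the shape of one replicas entry as inferred from the keys delete_from_storage reads; the SimpleNamespace stands in for Rucio's internal scope object (which exposes .external), and all values are placeholders.

from types import SimpleNamespace

replica_stub = {
    'scope': SimpleNamespace(external='user.jdoe'),  # stand-in for an InternalScope
    'name': 'file.root',
    'bytes': 1048576,
    'pfn': 'srm://storage.example.org/rucio/user.jdoe/file.root',
    'state': None,  # compared against ReplicaState.AVAILABLE when the file is not found
}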
Ejemplo n.º 26
0
def submitter(once=False, rses=[], process=0, total_processes=1, thread=0, total_threads=1, mock=False, bulk=100, activities=None):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    logging.info('submitter starting - process (%i/%i) thread (%i/%i)' % (process,
                                                                          total_processes,
                                                                          thread,
                                                                          total_threads))

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = 'srm'

    logging.info('submitter started - process (%i/%i) thread (%i/%i)' % (process,
                                                                         total_processes,
                                                                         thread,
                                                                         total_threads))

    while not graceful_stop.is_set():

        try:

            if activities is None:
                activities = [None]
            for activity in activities:
                if rses is None:
                    rses = [None]

                for rse in rses:
                    if rse:
                        # run in rse list mode
                        rse_info = rsemgr.get_rse_info(rse['rse'])
                        logging.info("Working on RSE: %s" % rse['rse'])
                        ts = time.time()
                        reqs = get_requests(rse_id=rse['id'],
                                            process=process,
                                            total_processes=total_processes,
                                            thread=thread,
                                            total_threads=total_threads,
                                            mock=mock,
                                            bulk=bulk,
                                            activity=activity)
                        record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000)
                    else:
                        # no rse list, run FIFO mode
                        rse_info = None
                        ts = time.time()
                        reqs = get_requests(process=process,
                                            total_processes=total_processes,
                                            thread=thread,
                                            total_threads=total_threads,
                                            mock=mock,
                                            bulk=bulk,
                                            activity=activity)
                        record_timer('daemons.conveyor.submitter.get_requests', (time.time() - ts) * 1000)

                    if reqs:
                        logging.debug('%i:%i - submitting %i requests' % (process, thread, len(reqs)))

                    if not reqs or reqs == []:
                        time.sleep(1)
                        continue

                    for req in reqs:
                        try:
                            if not rse:
                                # no rse list, in FIFO mode
                                dest_rse = rse_core.get_rse(rse=None, rse_id=req['dest_rse_id'])
                                rse_info = rsemgr.get_rse_info(dest_rse['rse'])

                            ts = time.time()
                            transfer = get_transfer(rse_info, req, scheme, mock)
                            record_timer('daemons.conveyor.submitter.get_transfer', (time.time() - ts) * 1000)
                            logging.debug('Transfer for request %s: %s' % (req['request_id'], transfer))

                            if transfer is None:
                                logging.warn("Request %s DID %s:%s RSE %s failed to get transfer" % (req['request_id'],
                                                                                                     req['scope'],
                                                                                                     req['name'],
                                                                                                     rse_info['rse']))
                                # TODO: Merge these two calls
                                request.set_request_state(req['request_id'],
                                                          RequestState.LOST)  # if the DID does not exist anymore
                                request.archive_request(req['request_id'])
                                continue

                            ts = time.time()
                            tmp_metadata = transfer['file_metadata']
                            eids = request.submit_transfers(transfers=[transfer, ],
                                                            transfertool='fts3',
                                                            job_metadata=tmp_metadata)

                            record_timer('daemons.conveyor.submitter.submit_transfer', (time.time() - ts) * 1000)

                            ts = time.time()
                            if req['previous_attempt_id']:
                                logging.info('COPYING RETRY %s REQUEST %s PREVIOUS %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['retry_count'],
                                                                                                                                        req['request_id'],
                                                                                                                                        req['previous_attempt_id'],
                                                                                                                                        req['scope'],
                                                                                                                                        req['name'],
                                                                                                                                        transfer['src_urls'],
                                                                                                                                        transfer['dest_urls'],
                                                                                                                                        eids[req['request_id']]['external_host'],
                                                                                                                                        eids[req['request_id']]['external_id']))
                            else:
                                logging.info('COPYING REQUEST %s DID %s:%s FROM %s TO %s USING %s with eid: %s' % (req['request_id'],
                                                                                                                   req['scope'],
                                                                                                                   req['name'],
                                                                                                                   transfer['src_urls'],
                                                                                                                   transfer['dest_urls'],
                                                                                                                   eids[req['request_id']]['external_host'],
                                                                                                                   eids[req['request_id']]['external_id']))
                            record_counter('daemons.conveyor.submitter.submit_request')
                        except UnsupportedOperation as e:
                            # The replica doesn't exist, need to cancel the request
                            logging.warning(e)
                            logging.info('Cancelling transfer request %s' % req['request_id'])
                            try:
                                # TODO: for now, there is only ever one destination
                                request.cancel_request_did(req['scope'], req['name'], transfer['dest_urls'][0])
                            except Exception as e:
                                logging.warning('Cannot cancel request: %s' % str(e))
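
A hedged one-shot invocation of this submitter loop, assuming it honours once=True like the other daemons here; the RSE entry only needs the 'rse' and 'id' keys read above, and all values are placeholders.

submitter(once=True,
          rses=[{'rse': 'SITE_DATADISK', 'id': 1}],  # placeholder RSE name and id
          process=0, total_processes=1,
          thread=0, total_threads=1,
          mock=True,     # substitute mock source URLs, as in get_transfer
          bulk=10,
          activities=None)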
Ejemplo n.º 27
0
                    monitor.record_counter(counters="transmogrifier.did.dataset.processed", delta=1)
                elif did["did_type"] == str(DIDType.CONTAINER):
                    monitor.record_counter(counters="transmogrifier.did.container.processed", delta=1)
                monitor.record_counter(counters="transmogrifier.did.processed", delta=1)
                identifiers.append(
                    {"scope": did["scope"], "name": did["name"], "did_type": DIDType.from_sym(did["did_type"])}
                )
            time1 = time.time()
            for identifier_chunk in chunks(identifiers, 100):
                _retrial(set_new_dids, identifier_chunk, None)
            logging.info("Thread %i : Time to set the new flag : %f" % (worker_number, time.time() - time1))
            tottime = time.time() - start_time
            logging.info("Thread %i : It took %f seconds to process %i DIDs" % (worker_number, tottime, len(dids)))
            logging.debug("Thread %i : DIDs processed : %s" % (worker_number, str(dids)))
            monitor.record_counter(counters="transmogrifier.job.done", delta=1)
            monitor.record_timer(stat="transmogrifier.job.duration", time=1000 * tottime)
        except Exception:
            exc_type, exc_value, exc_traceback = exc_info()
            logging.critical("".join(format_exception(exc_type, exc_value, exc_traceback)).strip())
            monitor.record_counter(counters="transmogrifier.job.error", delta=1)
            monitor.record_counter(counters="transmogrifier.addnewrule.error", delta=1)
        logging.info("Run once: %s" % once)
        if once is True:
            break
        if tottime < 10:
            time.sleep(10 - tottime)
    logging.info("Thread %i : Graceful stop requested" % (worker_number))
    logging.info("Thread %i : Graceful stop done" % (worker_number))


def run(total_workers=1, chunk_size=100, once=False):
Ejemplo n.º 28
0
def get_transfer(rse, req, scheme, mock):
    """
    Build the transfer dictionary for a request, or return None if no suitable
    sources or destinations can be found.
    """
    src_spacetoken = None

    ts = time.time()
    sources, metadata = get_sources(rse, scheme, req)
    record_timer('daemons.conveyor.submitter.get_sources', (time.time() - ts) * 1000)
    logging.debug('Sources for request %s: %s' % (req['request_id'], sources))
    if sources is None:
        logging.error("Request %s DID %s:%s RSE %s failed to get sources" % (req['request_id'],
                                                                             req['scope'],
                                                                             req['name'],
                                                                             rse['rse']))
        return None
    filesize = metadata['filesize']
    md5 = metadata['md5']
    adler32 = metadata['adler32']

    ts = time.time()
    destinations, dest_spacetoken = get_destinations(rse, scheme, req, sources)
    record_timer('daemons.conveyor.submitter.get_destinations', (time.time() - ts) * 1000)
    logging.debug('Destinations for request %s: %s' % (req['request_id'], destinations))
    if destinations is None:
        logging.error("Request %s DID %s:%s RSE %s failed to get destinations" % (req['request_id'],
                                                                                  req['scope'],
                                                                                  req['name'],
                                                                                  rse['rse']))
        return None

    # Come up with mock sources if necessary
    if mock:
        tmp_sources = []
        for s in sources:
            tmp_sources.append((s[0], ':'.join(['mock']+s[1].split(':')[1:])))
        sources = tmp_sources

    tmp_metadata = {'request_id': req['request_id'],
                    'scope': req['scope'],
                    'name': req['name'],
                    'activity': req['activity'],
                    'src_rse': sources[0][0],
                    'dst_rse': rse['rse'],
                    'dest_rse_id': req['dest_rse_id'],
                    'filesize': filesize,
                    'md5': md5,
                    'adler32': adler32}
    if 'previous_attempt_id' in req and req['previous_attempt_id']:
        tmp_metadata['previous_attempt_id'] = req['previous_attempt_id']

    # Extend the metadata dictionary with request attributes
    copy_pin_lifetime, overwrite, bring_online = -1, True, None
    if req['request_type'] == RequestType.STAGEIN:
        if req['attributes']:
            if type(req['attributes']) is dict:
                attr = json.loads(json.dumps(req['attributes']))
            else:
                attr = json.loads(str(req['attributes']))
            copy_pin_lifetime = attr.get('lifetime')
        overwrite = False
        bring_online = 21000

    # if the source for transfer is a tape rse, set bring_online
    if req['request_type'] == RequestType.TRANSFER\
       and rse_core.get_rse(sources[0][0]).rse_type == RSEType.TAPE:
        bring_online = 21000

    # never overwrite on tape destinations
    if req['request_type'] == RequestType.TRANSFER\
       and rse_core.get_rse(None, rse_id=req['dest_rse_id']).rse_type == RSEType.TAPE:
        overwrite = False

    # exclude destination replica from source
    source_surls = [s[1] for s in sources]
    if req['request_type'] == RequestType.STAGEIN and sorted(source_surls) == sorted(destinations):
        logging.debug('STAGING REQUEST %s - Will not try to ignore equivalent sources' % req['request_id'])
    elif req['request_type'] == RequestType.STAGEIN:
        logging.debug('STAGING REQUEST %s - Forcing destination to source' % req['request_id'])
        destinations = source_surls
    else:
        # remove matches from a copy so that iterating over source_surls does not skip entries
        new_sources = list(source_surls)
        for source_surl in source_surls:
            if source_surl in destinations:
                logging.info('Excluding source %s for request %s' % (source_surl, req['request_id']))
                new_sources.remove(source_surl)
        source_surls = new_sources

        # make sure we only use one source when bring_online is needed
        if bring_online and len(new_sources) > 1:
            source_surls = [new_sources[0]]
            logging.info('Only using first source %s for bring_online request %s' % (source_surls, req['request_id']))

    if not source_surls:
        logging.error('All sources excluded - SKIP REQUEST %s' % req['request_id'])
        return

    # Sources are properly set, so now we can finally force the source RSE to the destination RSE for STAGEIN
    if req['request_type'] == RequestType.STAGEIN:
        tmp_metadata['dst_rse'] = sources[0][0]

    # get external host
    if rse_core.get_rse(rse['rse'])['staging_area'] or rse['rse'].endswith("STAGING"):
        rse_attr = rse_core.list_rse_attributes(sources[0][0])
    else:
        rse_attr = rse_core.list_rse_attributes(rse['rse'], rse['id'])
    fts_hosts = rse_attr.get('fts', None)
    retry_count = req['retry_count']
    if not retry_count:
        retry_count = 0
    if not fts_hosts:
        logging.error('Destination RSE %s FTS attribute not defined - SKIP REQUEST %s' % (rse['rse'], req['request_id']))
        return

    fts_list = fts_hosts.split(",")
    external_host = fts_list[retry_count/len(fts_list)]

    transfer = {'request_id': req['request_id'],
                'src_urls': source_surls,
                'dest_urls': destinations,
                'filesize': filesize,
                'md5': md5,
                'adler32': adler32,
                'src_spacetoken': src_spacetoken,
                'dest_spacetoken': dest_spacetoken,
                'activity': req['activity'],
                'overwrite': overwrite,
                'bring_online': bring_online,
                'copy_pin_lifetime': copy_pin_lifetime,
                'external_host': external_host,
                'file_metadata': tmp_metadata}
    return transfer
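
The FTS host selection at the end of get_transfer picks hosts from the comma-separated 'fts' attribute by integer-dividing the retry count by the number of hosts. A small worked example; host names are placeholders, and '//' is written explicitly where the original relies on Python 2 integer division.

fts_hosts = 'https://fts-a.example.org:8446,https://fts-b.example.org:8446'
fts_list = fts_hosts.split(",")
for retry_count in range(4):
    # retries 0 and 1 go to fts-a, retries 2 and 3 go to fts-b
    print(retry_count, fts_list[retry_count // len(fts_list)])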
Ejemplo n.º 29
0
def poller(once=False, process=0, total_processes=1, thread=0, total_threads=1, bulk=1000, older_than=60):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    logging.info('poller starting - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                 thread, total_threads,
                                                                                 bulk))

    logging.info('poller started - process (%i/%i) thread (%i/%i) bulk (%i)' % (process, total_processes,
                                                                                thread, total_threads,
                                                                                bulk))

    while not graceful_stop.is_set():

        try:
            ts = time.time()

            logging.debug('%i:%i - start to poll requests older than %i seconds' % (process, thread, older_than))
            reqs = request.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                    state=RequestState.SUBMITTED,
                                    limit=bulk,
                                    older_than=datetime.datetime.utcnow()-datetime.timedelta(seconds=older_than),
                                    process=process, total_processes=total_processes,
                                    thread=thread, total_threads=total_threads)
            record_timer('daemons.conveyor.poller.000-get_next', (time.time()-ts)*1000)

            if reqs:
                logging.debug('%i:%i - polling %i requests' % (process, thread, len(reqs)))

            if not reqs or reqs == []:
                if once:
                    break
                logging.debug("no requests found. will sleep 60 seconds")
                time.sleep(60)  # Only sleep if there is nothing to do
                continue

            for xfers in chunks(reqs, bulk):
                try:
                    req_ids = {}
                    for req in xfers:
                        record_counter('daemons.conveyor.poller.query_request')
                        if not req['external_host'] in req_ids:
                            req_ids[req['external_host']] = []
                        req_ids[req['external_host']].append((req['request_id'], req['external_id']))

                    responses = {}
                    for external_host in req_ids:
                        ts = time.time()
                        resps = request.bulk_query_requests(external_host, req_ids[external_host], 'fts3')
                        record_timer('daemons.conveyor.poller.001-bulk_query_requests', (time.time()-ts)*1000/len(req_ids[external_host]))
                        responses.update(resps)

                    for external_host in req_ids:
                        for request_id, external_id in req_ids[external_host]:
                            response = responses[request_id]
                            if isinstance(response, Exception):
                                logging.critical("Failed to poll request(%s) with FTS(%s) job (%s): %s" % (request_id, external_host, external_id, responses[request_id]))
                                record_counter('daemons.conveyor.poller.query_request_exception')
                                response = {'new_state': None, 'request_id': request_id, 'transfer_id': external_id, 'job_state': None}
                            common.update_request_state(response)
                            record_counter('daemons.conveyor.poller.update_request_state')
                            if response['new_state'] == RequestState.LOST:
                                record_counter('daemons.conveyor.poller.request_lost')
                except Exception:
                    logging.critical(traceback.format_exc())

        except Exception:
            logging.critical(traceback.format_exc())

        if once:
            return

    logging.debug('%i:%i - graceful stop requested' % (process, thread))

    logging.debug('%i:%i - graceful stop done' % (process, thread))
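
A self-contained sketch of the grouping step inside the poller above, which buckets requests by FTS host so that each host is bulk-queried once; the request data are placeholders.

reqs = [{'request_id': 'req-1', 'external_id': 'job-a', 'external_host': 'https://fts-a.example.org:8446'},
        {'request_id': 'req-2', 'external_id': 'job-b', 'external_host': 'https://fts-a.example.org:8446'},
        {'request_id': 'req-3', 'external_id': 'job-c', 'external_host': 'https://fts-b.example.org:8446'}]

req_ids = {}
for req in reqs:
    req_ids.setdefault(req['external_host'], []).append((req['request_id'], req['external_id']))
# req_ids now maps each FTS host to its list of (request_id, external_id) pairs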
Ejemplo n.º 30
0
def poll_transfers(external_host, xfers, prepend_str='', request_ids=None, timeout=None):
    """
    Poll a list of transfers from an FTS server

    :param external_host:    The FTS server to query from.
    :param xfers:            List of transfers to poll.
    :param prepend_str:      String to prepend to log messages.
    :param request_ids:      List of request IDs associated with the transfers.
    :param timeout:          Timeout.
    """
    try:
        if TRANSFER_TOOL == 'mock':
            logging.debug(prepend_str + 'Setting %s transfer requests status to DONE per mock tool' % (len(xfers)))
            for task_id in xfers:
                ret = transfer_core.update_transfer_state(external_host=None, transfer_id=task_id, state=RequestState.DONE)
                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)
            return
        try:
            tss = time.time()
            logging.info(prepend_str + 'Polling %i transfers against %s with timeout %s' % (len(xfers), external_host, timeout))
            resps = transfer_core.bulk_query_transfers(external_host, xfers, TRANSFER_TOOL, timeout)
            record_timer('daemons.conveyor.poller.bulk_query_transfers', (time.time() - tss) * 1000 / len(xfers))
        except TransferToolTimeout as error:
            logging.error(prepend_str + str(error))
            return
        except TransferToolWrongAnswer as error:
            logging.error(prepend_str + str(error))
            logging.error(prepend_str + 'Problem querying %s on %s. All jobs are being checked individually' % (str(xfers), external_host))
            for xfer in xfers:
                try:
                    logging.debug(prepend_str + 'Checking %s on %s' % (xfer, external_host))
                    status = transfer_core.bulk_query_transfers(external_host, [xfer, ], TRANSFER_TOOL, timeout)
                    if xfer in status and isinstance(status[xfer], Exception):
                        logging.error(prepend_str + 'Problem querying %s on %s. Error returned: %s' % (xfer, external_host, str(status[xfer])))
                except Exception as err:
                    logging.error(prepend_str + 'Problem querying %s on %s. Error returned: %s' % (xfer, external_host, str(err)))
                    break
            return
        except RequestException as error:
            logging.error(prepend_str + "Failed to contact FTS server: %s" % (str(error)))
            return
        except Exception:
            logging.error(prepend_str + "Failed to query FTS info: %s" % (traceback.format_exc()))
            return

        logging.debug(prepend_str + 'Polled %s transfer requests status in %s seconds' % (len(xfers), (time.time() - tss)))
        tss = time.time()
        logging.debug(prepend_str + 'Updating %s transfer requests status' % (len(xfers)))
        cnt = 0

        if TRANSFER_TOOL == 'globus':
            for task_id in resps:
                ret = transfer_core.update_transfer_state(external_host=None, transfer_id=task_id, state=resps[task_id])
                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)
        else:
            for transfer_id in resps:
                try:
                    transf_resp = resps[transfer_id]
                    # transf_resp is None: Lost.
                    #             is Exception: Failed to get fts job status.
                    #             is {}: No terminated jobs.
                    #             is {request_id: {file_status}}: terminated jobs.
                    if transf_resp is None:
                        transfer_core.update_transfer_state(external_host, transfer_id, RequestState.LOST, logging_prepend_str=prepend_str)
                        record_counter('daemons.conveyor.poller.transfer_lost')
                    elif isinstance(transf_resp, Exception):
                        logging.warning(prepend_str + "Failed to poll FTS(%s) job (%s): %s" % (external_host, transfer_id, transf_resp))
                        record_counter('daemons.conveyor.poller.query_transfer_exception')
                    else:
                        for request_id in transf_resp:
                            if request_id in request_ids:
                                ret = request_core.update_request_state(transf_resp[request_id], logging_prepend_str=prepend_str)
                                # if True, really update request content; if False, only touch request
                                if ret:
                                    cnt += 1
                                record_counter('daemons.conveyor.poller.update_request_state.%s' % ret)

                    # should touch transfers.
                    # Otherwise if one bulk transfer includes many requests and one is not terminated, the transfer will be poll again.
                    transfer_core.touch_transfer(external_host, transfer_id)
                except (DatabaseException, DatabaseError) as error:
                    if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                        logging.warn(prepend_str + "Lock detected when handling request %s - skipping" % request_id)
                    else:
                        logging.error(traceback.format_exc())
            logging.debug(prepend_str + 'Finished updating %s transfer requests status (%i requests state changed) in %s seconds' % (len(xfers), cnt, (time.time() - tss)))
    except Exception:
        logging.error(traceback.format_exc())
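
The inline comment above spells out the four shapes a bulk-query response can take: None (lost), an Exception (the FTS status query itself failed), an empty dict (no terminated files yet), or a per-request status dict. The small helper below is a hypothetical illustration of that dispatch, not part of the Rucio API:

def classify_transfer_response(transf_resp):
    """
    Classify a single bulk-query response following the convention documented above:
    None       -> the transfer is lost,
    Exception  -> the FTS status query itself failed,
    {}         -> no file in the job has reached a terminal state yet,
    {request_id: {...}} -> per-request terminal file status.
    """
    if transf_resp is None:
        return 'lost'
    if isinstance(transf_resp, Exception):
        return 'query_failed'
    if not transf_resp:
        return 'not_terminated'
    return 'terminated'

assert classify_transfer_response(None) == 'lost'
assert classify_transfer_response(RuntimeError('boom')) == 'query_failed'
assert classify_transfer_response({}) == 'not_terminated'
assert classify_transfer_response({'req-1': {'new_state': 'DONE'}}) == 'terminated'
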
Ejemplo n.º 31
0
def submitter(once=False,
              rses=[],
              mock=False,
              process=0,
              total_processes=1,
              total_threads=1,
              bulk=100,
              group_bulk=1,
              group_policy='rule',
              fts_source_strategy='auto',
              activities=None,
              sleep_time=600,
              max_sources=4,
              retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    logging.info('Transfer submitter starting - process (%i/%i) threads (%i)' %
                 (process, total_processes, total_threads))

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" %
                  max_time_in_queue)

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)

    logging.info(
        'Transfer submitter started - process (%i/%i) threads (%i/%i) timeout (%s)'
        % (process, total_processes, hb['assign_thread'], hb['nr_threads'],
           timeout))

    threadPool = ThreadPool(total_threads)
    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            hb = heartbeat.live(executable,
                                hostname,
                                pid,
                                hb_thread,
                                older_than=3600)

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info("CMS user transfer activity")
                    user_transfer = True

                logging.info(
                    "%s:%s Starting to get transfer transfers for %s" %
                    (process, hb['assign_thread'], activity))
                ts = time.time()
                transfers = __get_transfers(process=process,
                                            total_processes=total_processes,
                                            thread=hb['assign_thread'],
                                            total_threads=hb['nr_threads'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.per_transfer',
                    (time.time() - ts) * 1000 /
                    (len(transfers) if len(transfers) else 1))
                record_counter(
                    'daemons.conveyor.transfer_submitter.get_transfers',
                    len(transfers))
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.transfers',
                    len(transfers))
                logging.info(
                    "%s:%s Got %s transfers for %s" %
                    (process, hb['assign_thread'], len(transfers), activity))

                # group transfers
                logging.info("%s:%s Starting to group transfers for %s" %
                             (process, hb['assign_thread'], activity))
                ts = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk,
                                                   fts_source_strategy,
                                                   max_time_in_queue)
                record_timer(
                    'daemons.conveyor.transfer_submitter.bulk_group_transfer',
                    (time.time() - ts) * 1000 /
                    (len(transfers) if len(transfers) else 1))

                logging.info("%s:%s Starting to submit transfers for %s" %
                             (process, hb['assign_thread'], activity))

                for external_host in grouped_jobs:
                    if not user_transfer:
                        for job in grouped_jobs[external_host]:
                            # submit transfers
                            job_requests = makeRequests(
                                submit_transfer,
                                args_list=[((), {
                                    'external_host': external_host,
                                    'job': job,
                                    'submitter': 'transfer_submitter',
                                    'process': process,
                                    'thread': hb['assign_thread'],
                                    'timeout': timeout
                                })])
                            [
                                threadPool.putRequest(job_req)
                                for job_req in job_requests
                            ]
                    else:
                        for user, jobs in grouped_jobs[
                                external_host].items():
                            # submit transfers
                            for job in jobs:
                                job_requests = makeRequests(
                                    submit_transfer,
                                    args_list=[((), {
                                        'external_host':
                                        external_host,
                                        'job':
                                        job,
                                        'submitter':
                                        'transfer_submitter',
                                        'process':
                                        process,
                                        'thread':
                                        hb['assign_thread'],
                                        'timeout':
                                        timeout,
                                        'user_transfer_job':
                                        user_transfer
                                    })])
                                [
                                    threadPool.putRequest(job_req)
                                    for job_req in job_requests
                                ]
                threadPool.wait()

                if len(transfers) < group_bulk:
                    logging.info(
                        '%i:%i - only %s transfers for %s which is less than group bulk %s, sleep %s seconds'
                        % (process, hb['assign_thread'], len(transfers),
                           activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time(
                        ) + sleep_time
        except:
            logging.critical(
                '%s:%s %s' %
                (process, hb['assign_thread'], traceback.format_exc()))

        if once:
            break

    logging.info('%s:%s graceful stop requested' %
                 (process, hb['assign_thread']))

    threadPool.dismissWorkers(total_threads, do_join=True)
    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s:%s graceful stop done' % (process, hb['assign_thread']))
    return
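
The submitter parses the conveyor max_time_in_queue option as a comma-separated list of activity:hours pairs, falling back to a default of 168 hours. A standalone sketch of that parsing, using a made-up option value:

def parse_max_time_in_queue(timelife_conf, default_hours=168):
    """Parse a conveyor 'max_time_in_queue' style option, e.g. 'User Subscriptions:48,default:168'."""
    max_time_in_queue = {}
    for conf in timelife_conf.split(','):
        act, timelife = conf.split(':')
        max_time_in_queue[act.strip()] = int(timelife.strip())
    max_time_in_queue.setdefault('default', default_hours)
    return max_time_in_queue

print(parse_max_time_in_queue('User Subscriptions:48, Express:24'))
# {'User Subscriptions': 48, 'Express': 24, 'default': 168}
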
Ejemplo n.º 32
0
def submit_transfer(external_host, job, submitter='submitter', timeout=None, user_transfer_job=False, logger=logging.log, transfertool=TRANSFER_TOOL):
    """
    Submit a transfer or staging request

    :param external_host:         FTS server to submit to.
    :param job:                   Job dictionary.
    :param submitter:             Name of the submitting entity.
    :param timeout:               Timeout
    :param user_transfer_job:     Parameter for transfer with user credentials
    :param logger:                Optional decorated logger that can be passed from the calling daemons or servers.
    :param transfertool:          Transfer tool to use for the submission (defaults to the configured TRANSFER_TOOL).
    """

    # Prepare submitting
    xfers_ret = {}

    try:
        for t_file in job['files']:
            file_metadata = t_file['metadata']
            request_id = file_metadata['request_id']
            log_str = 'PREPARING REQUEST %s DID %s:%s TO SUBMITTING STATE PREVIOUS %s FROM %s TO %s USING %s ' % (file_metadata['request_id'],
                                                                                                                  file_metadata['scope'],
                                                                                                                  file_metadata['name'],
                                                                                                                  file_metadata['previous_attempt_id'] if 'previous_attempt_id' in file_metadata else None,
                                                                                                                  t_file['sources'],
                                                                                                                  t_file['destinations'],
                                                                                                                  external_host)
            xfers_ret[request_id] = {'state': RequestState.SUBMITTING, 'external_host': external_host, 'external_id': None, 'dest_url': t_file['destinations'][0]}
            logger(logging.INFO, "%s", log_str)
            xfers_ret[request_id]['file'] = t_file
        logger(logging.DEBUG, 'Start to prepare transfer')
        transfer_core.prepare_sources_for_transfers(xfers_ret)
        logger(logging.DEBUG, 'Finished to prepare transfer')
    except RequestNotFound as error:
        logger(logging.ERROR, str(error))
        return
    except Exception:
        logger(logging.ERROR, 'Failed to prepare requests %s state to SUBMITTING (Will not submit jobs but return directly)' % list(xfers_ret.keys()), exc_info=True)
        return

    # Prepare the dictionary for xfers results
    xfers_ret = {}
    for t_file in job['files']:
        file_metadata = t_file['metadata']
        request_id = file_metadata['request_id']
        xfers_ret[request_id] = {'scope': file_metadata['scope'],
                                 'name': file_metadata['name'],
                                 'external_host': external_host,
                                 'external_id': None,
                                 'request_type': t_file.get('request_type', None),
                                 'dst_rse': file_metadata.get('dst_rse', None),
                                 'src_rse': file_metadata.get('src_rse', None),
                                 'src_rse_id': file_metadata['src_rse_id'],
                                 'metadata': file_metadata}

    # Submit the job
    try:
        start_time = time.time()
        logger(logging.INFO, 'About to submit job to %s with timeout %s' % (external_host, timeout))
        # An eid is returned if the job is properly submitted, otherwise an exception is raised
        eid = transfer_core.submit_bulk_transfers(external_host,
                                                  files=job['files'],
                                                  transfertool=transfertool,
                                                  job_params=job['job_params'],
                                                  timeout=timeout,
                                                  user_transfer_job=user_transfer_job)
        duration = time.time() - start_time
        logger(logging.INFO, 'Submit job %s to %s in %s seconds' % (eid, external_host, duration))
        record_timer('daemons.conveyor.%s.submit_bulk_transfer.per_file' % submitter, (time.time() - start_time) * 1000 / len(job['files']))
        record_counter('daemons.conveyor.%s.submit_bulk_transfer' % submitter, len(job['files']))
        record_timer('daemons.conveyor.%s.submit_bulk_transfer.files' % submitter, len(job['files']))

        # Update all the requests to SUBMITTED and cancel the transfer job in case the update failed
        try:
            for t_file in job['files']:
                file_metadata = t_file['metadata']
                request_id = file_metadata['request_id']
                xfers_ret[request_id]['state'] = RequestState.SUBMITTED
                xfers_ret[request_id]['external_id'] = eid
                logger(logging.INFO, 'COPYING REQUEST %s DID %s:%s USING %s with state(%s) with eid(%s)' % (file_metadata['request_id'], file_metadata['scope'], file_metadata['name'], external_host, RequestState.SUBMITTED, eid))
            logger(logging.DEBUG, 'Start to bulk register transfer state for eid %s' % eid)
            transfer_core.set_transfers_state(xfers_ret, datetime.datetime.utcnow())
            logger(logging.DEBUG, 'Finished to register transfer state')
        except Exception:
            logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True)
            try:
                logger(logging.INFO, 'Cancel transfer %s on %s', eid, external_host)
                request.cancel_request_external_id(eid, external_host)
            except Exception:
                # The job is still submitted in the file transfer service but the request is not updated. Possibility to have a double submission during the next cycle
                logger(logging.ERROR, 'Failed to cancel transfers %s on %s with error' % (eid, external_host), exc_info=True)

    # This exception is raised if one job is already submitted for one file
    except DuplicateFileTransferSubmission as error:
        logger(logging.WARNING, 'Failed to submit a job because of duplicate file : %s', str(error))
        logger(logging.INFO, 'Submitting files one by one')

        try:
            # In this loop we submit the jobs and update the requests state one by one
            single_xfers_ret = {}
            for t_file in job['files']:
                file_metadata = t_file['metadata']
                request_id = file_metadata['request_id']
                single_xfers_ret = {}
                single_xfers_ret[request_id] = xfers_ret[request_id]
                start_time = time.time()
                logger(logging.INFO, 'About to submit job to %s with timeout %s', external_host, timeout)
                eid = transfer_core.submit_bulk_transfers(external_host,
                                                          files=[t_file],
                                                          transfertool=transfertool,
                                                          job_params=job['job_params'],
                                                          timeout=timeout,
                                                          user_transfer_job=user_transfer_job)
                duration = time.time() - start_time
                logger(logging.INFO, 'Submit job %s to %s in %s seconds' % (eid, external_host, duration))
                record_timer('daemons.conveyor.%s.submit_bulk_transfer.per_file' % submitter, (time.time() - start_time) * 1000)
                record_counter('daemons.conveyor.%s.submit_bulk_transfer' % submitter, 1)
                record_timer('daemons.conveyor.%s.submit_bulk_transfer.files' % submitter, 1)
                single_xfers_ret[request_id]['state'] = RequestState.SUBMITTED
                single_xfers_ret[request_id]['external_id'] = eid
                logger(logging.INFO, 'COPYING REQUEST %s DID %s:%s USING %s with state(%s) with eid(%s)' % (file_metadata['request_id'], file_metadata['scope'], file_metadata['name'], external_host, RequestState.SUBMITTED, eid))
                try:
                    logger(logging.DEBUG, 'Start to register transfer state')
                    transfer_core.set_transfers_state(single_xfers_ret, datetime.datetime.utcnow())
                    logger(logging.DEBUG, 'Finished to register transfer state')
                except Exception:
                    logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True)
                    try:
                        logger(logging.INFO, 'Cancel transfer %s on %s', eid, external_host)
                        request.cancel_request_external_id(eid, external_host)
                    except Exception:
                        logger(logging.ERROR, 'Failed to cancel transfers %s on %s with error' % (eid, external_host), exc_info=True)

        except (DuplicateFileTransferSubmission, TransferToolTimeout, TransferToolWrongAnswer, Exception) as error:
            request_id = list(single_xfers_ret.keys())[0]
            single_xfers_ret[request_id]['state'] = RequestState.SUBMISSION_FAILED
            logger(logging.ERROR, 'Cannot submit the job for request %s : %s', request_id, str(error))
            try:
                logger(logging.DEBUG, 'Update the transfer state to fail')
                transfer_core.set_transfers_state(single_xfers_ret, datetime.datetime.utcnow())
                logger(logging.DEBUG, 'Finished to register transfer state')
            except Exception:
                logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True)
                # No need to cancel the job here

    # The following exceptions are raised if the job failed to be submitted
    except (TransferToolTimeout, TransferToolWrongAnswer) as error:
        logger(logging.ERROR, str(error))
        try:
            for t_file in job['files']:
                file_metadata = t_file['metadata']
                request_id = file_metadata['request_id']
                xfers_ret[request_id]['state'] = RequestState.SUBMISSION_FAILED
            logger(logging.DEBUG, 'Start to register transfer state')
            transfer_core.set_transfers_state(xfers_ret, datetime.datetime.utcnow())
            logger(logging.DEBUG, 'Finished to register transfer state')
        except Exception:
            logger(logging.ERROR, 'Failed to register transfer state with error', exc_info=True)
            # No need to cancel the job here

    except Exception as error:
        logger(logging.ERROR, 'Failed to submit a job with error %s', str(error), exc_info=True)
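
The overall structure of submit_transfer is: mark the requests SUBMITTING, try one bulk submission, and on DuplicateFileTransferSubmission fall back to submitting the files one by one. The sketch below reproduces just that control flow with placeholder names (DuplicateSubmission, submit_bulk); it is not the Rucio transfertool interface.

class DuplicateSubmission(Exception):
    """Stand-in for rucio.common.exception.DuplicateFileTransferSubmission."""

def submit_bulk(files):
    # Placeholder for transfer_core.submit_bulk_transfers(); always raise on a bulk
    # submission here to simulate one file already having a job.
    if len(files) > 1:
        raise DuplicateSubmission('one of the files already has a job')
    return 'job-%s' % files[0]

def submit_with_fallback(files):
    """Try one bulk submission; on a duplicate, retry the files one by one."""
    try:
        return {tuple(files): submit_bulk(files)}
    except DuplicateSubmission:
        results = {}
        for one_file in files:
            try:
                results[(one_file,)] = submit_bulk([one_file])
            except DuplicateSubmission as error:
                results[(one_file,)] = error   # marked SUBMISSION_FAILED upstream
        return results

print(submit_with_fallback(['f1', 'f2']))
# {('f1',): 'job-f1', ('f2',): 'job-f2'}
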
Ejemplo n.º 33
0
def handle_requests(reqs, suspicious_patterns):
    """
    Used by the finisher daemon to handle terminated requests.

    :param reqs:                 List of requests.
    :param suspicious_patterns:  Patterns used to check failed requests for suspicious files.
    """

    undeterministic_rses = get_undeterministic_rses()
    rses_info, protocols = {}, {}
    replicas = {}
    for req in reqs:
        try:
            replica = {'scope': req['scope'], 'name': req['name'], 'rse_id': req['dest_rse_id'], 'bytes': req['bytes'], 'adler32': req['adler32'], 'request_id': req['request_id']}

            replica['pfn'] = req['dest_url']
            replica['request_type'] = req['request_type']
            replica['error_message'] = None

            if req['request_type'] not in replicas:
                replicas[req['request_type']] = {}
            if req['rule_id'] not in replicas[req['request_type']]:
                replicas[req['request_type']][req['rule_id']] = []

            if req['state'] == RequestState.DONE:
                replica['state'] = ReplicaState.AVAILABLE
                replica['archived'] = False

                # for TAPE, replica path is needed
                if req['request_type'] == RequestType.TRANSFER and req['dest_rse_id'] in undeterministic_rses:
                    if req['dest_rse_id'] not in rses_info:
                        dest_rse = rse_core.get_rse_name(rse_id=req['dest_rse_id'])
                        rses_info[req['dest_rse_id']] = rsemanager.get_rse_info(dest_rse)
                    pfn = req['dest_url']
                    scheme = urlparse(pfn).scheme
                    dest_rse_id_scheme = '%s_%s' % (req['dest_rse_id'], scheme)
                    if dest_rse_id_scheme not in protocols:
                        protocols[dest_rse_id_scheme] = rsemanager.create_protocol(rses_info[req['dest_rse_id']], 'write', scheme)
                    path = protocols[dest_rse_id_scheme].parse_pfns([pfn])[pfn]['path']
                    replica['path'] = os.path.join(path, os.path.basename(pfn))

                # replica should not be added to replicas until all info are filled
                replicas[req['request_type']][req['rule_id']].append(replica)

            elif req['state'] == RequestState.FAILED:
                check_suspicious_files(req, suspicious_patterns)
                if request_core.should_retry_request(req):
                    tss = time.time()
                    new_req = request_core.requeue_and_archive(req['request_id'])
                    record_timer('daemons.conveyor.common.update_request_state.request-requeue_and_archive', (time.time() - tss) * 1000)
                    if new_req:
                        # should_retry_request and requeue_and_archive are not in one session,
                        # another process can requeue_and_archive and this one will return None.
                        logging.warn('REQUEUED DID %s:%s REQUEST %s AS %s TRY %s' % (req['scope'],
                                                                                     req['name'],
                                                                                     req['request_id'],
                                                                                     new_req['request_id'],
                                                                                     new_req['retry_count']))
                else:
                    logging.warn('EXCEEDED DID %s:%s REQUEST %s' % (req['scope'], req['name'], req['request_id']))
                    replica['state'] = ReplicaState.UNAVAILABLE
                    replica['archived'] = False
                    replica['error_message'] = req['err_msg'] if req['err_msg'] else get_transfer_error(req['state'])
                    replicas[req['request_type']][req['rule_id']].append(replica)
            elif req['state'] == RequestState.SUBMITTING or req['state'] == RequestState.SUBMISSION_FAILED or req['state'] == RequestState.LOST:
                if req['updated_at'] > (datetime.datetime.utcnow() - datetime.timedelta(minutes=120)):
                    continue

                if request_core.should_retry_request(req):
                    tss = time.time()
                    new_req = request_core.requeue_and_archive(req['request_id'])
                    record_timer('daemons.conveyor.common.update_request_state.request-requeue_and_archive', (time.time() - tss) * 1000)
                    if new_req:
                        logging.warn('REQUEUED SUBMITTING DID %s:%s REQUEST %s AS %s TRY %s' % (req['scope'],
                                                                                                req['name'],
                                                                                                req['request_id'],
                                                                                                new_req['request_id'],
                                                                                                new_req['retry_count']))
                else:
                    logging.warn('EXCEEDED SUBMITTING DID %s:%s REQUEST %s' % (req['scope'], req['name'], req['request_id']))
                    replica['state'] = ReplicaState.UNAVAILABLE
                    replica['archived'] = False
                    replica['error_message'] = req['err_msg'] if req['err_msg'] else get_transfer_error(req['state'])
                    replicas[req['request_type']][req['rule_id']].append(replica)
            elif req['state'] == RequestState.NO_SOURCES or req['state'] == RequestState.ONLY_TAPE_SOURCES or req['state'] == RequestState.MISMATCH_SCHEME:
                if request_core.should_retry_request(req):
                    tss = time.time()
                    new_req = request_core.requeue_and_archive(req['request_id'])
                    record_timer('daemons.conveyor.common.update_request_state.request-requeue_and_archive', (time.time() - tss) * 1000)
                    if new_req:
                        # should_retry_request and requeue_and_archive are not in one session,
                        # another process can requeue_and_archive and this one will return None.
                        logging.warn('REQUEUED DID %s:%s REQUEST %s AS %s TRY %s' % (req['scope'],
                                                                                     req['name'],
                                                                                     req['request_id'],
                                                                                     new_req['request_id'],
                                                                                     new_req['retry_count']))
                else:
                    logging.warn('EXCEEDED DID %s:%s REQUEST %s' % (req['scope'], req['name'], req['request_id']))
                    replica['state'] = ReplicaState.UNAVAILABLE  # should be broken here
                    replica['archived'] = False
                    replica['error_message'] = req['err_msg'] if req['err_msg'] else get_transfer_error(req['state'])
                    replicas[req['request_type']][req['rule_id']].append(replica)

        except:
            logging.error("Something unexpected happened when handling request %s(%s:%s) at %s: %s" % (req['request_id'],
                                                                                                       req['scope'],
                                                                                                       req['name'],
                                                                                                       req['dest_rse_id'],
                                                                                                       traceback.format_exc()))

    handle_terminated_replicas(replicas)
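
handle_requests accumulates finished replicas into a nested mapping keyed first by request type and then by rule id before handing everything to handle_terminated_replicas. A compact sketch of that grouping using defaultdict, with hypothetical request dictionaries (the real code uses RequestType/RequestState enums rather than strings):

from collections import defaultdict

def group_replicas(reqs):
    """Group finished replicas by request type, then by rule id, as handle_requests does."""
    replicas = defaultdict(lambda: defaultdict(list))
    for req in reqs:
        replica = {'scope': req['scope'], 'name': req['name'], 'state': req['state']}
        replicas[req['request_type']][req['rule_id']].append(replica)
    return replicas

reqs = [
    {'request_type': 'TRANSFER', 'rule_id': 'rule-1', 'scope': 'tests', 'name': 'f1', 'state': 'DONE'},
    {'request_type': 'TRANSFER', 'rule_id': 'rule-1', 'scope': 'tests', 'name': 'f2', 'state': 'FAILED'},
    {'request_type': 'STAGEIN', 'rule_id': 'rule-2', 'scope': 'tests', 'name': 'f3', 'state': 'DONE'},
]

grouped = group_replicas(reqs)
print(len(grouped['TRANSFER']['rule-1']))   # 2
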
Ejemplo n.º 34
0
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, once=False, dataset_lifetime=None):
    sleep(sleep_time * (total_workers - worker_number) / total_workers)
    while not graceful_stop.is_set():
        starttime = time()
        logging.info('Thread [%i/%i] : Getting data distribution' % (worker_number, total_workers))
        probabilities, data = get_data_distribution(inputfile)
        logging.debug('Thread [%i/%i] : Probabilities %s' % (worker_number, total_workers, probabilities))
        account = 'root'
        scope = 'tests'
        now = datetime.now()
        dsn_extension = '%s.%s.%s.%s' % (now.year, now.month, now.day, generate_uuid())
        totretries = 3
        status = False
        for site in sites:
            for retry in xrange(0, totretries):
                ts = time()
                tmpdir = tempfile.mkdtemp()
                logging.info('Thread [%i/%i] : Running on site %s' % (worker_number, total_workers, site))
                d = choose_element(probabilities, data)
                metadata = d['metadata']
                metadata['version'] = str(random.randint(0, 1000))
                metadata['run_number'] = str(random.randint(0, 100000))
                uuid = generate_uuid()
                metadata['stream_name'] = 'automatix_stream'
                metadata['campaign'] = 'automatix_campaign'
                try:
                    nbfiles = d['nbfiles']
                except KeyError:
                    nbfiles = 2
                    logging.warning('Thread [%i/%i] : No nbfiles defined in the configuration, will use 2' % (worker_number, total_workers))
                try:
                    filesize = d['filesize']
                except KeyError:
                    filesize = 1000000
                    logging.warning('Thread [%i/%i] : No filesize defined in the configuration, will use 1M files' % (worker_number, total_workers))
                dsn = 'tests:%s.%s.%s.%s.%s.%s' % (metadata['project'], metadata['run_number'], metadata['stream_name'], metadata['prod_step'], metadata['datatype'], metadata['version'])
                fnames = []
                lfns = []
                for nb in xrange(nbfiles):
                    fname = '%s.%s' % (metadata['datatype'], generate_uuid())
                    lfns.append(fname)
                    fname = '%s/%s' % (tmpdir, fname)
                    logging.info('Thread [%(worker_number)s/%(total_workers)s] : Generating file %(fname)s in dataset %(dsn)s' % locals())
                    generate_file(fname, filesize)
                    fnames.append(fname)
                logging.info('Thread [%i/%i] : Upload %s to %s' % (worker_number, total_workers, dsn, site))
                status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, worker_number=worker_number, total_workers=total_workers, dataset_lifetime=dataset_lifetime, did=dsn)
                for fname in fnames:
                    remove(fname)
                rmdir(tmpdir)
                if status:
                    monitor.record_counter(counters='automatix.addnewdataset.done', delta=1)
                    monitor.record_counter(counters='automatix.addnewfile.done', delta=nbfiles)
                    monitor.record_timer('automatix.datasetinjection', (time() - ts) * 1000)
                    break
                else:
                    logging.info('Thread [%i/%i] : Failed to upload files. Will retry another time (attempt %s/%s)' % (worker_number, total_workers, str(retry + 1), str(totretries)))
        if once is True:
            logging.info('Thread [%i/%i] : Run with once mode. Exiting' % (worker_number, total_workers))
            break
        tottime = time() - starttime
        if status:
            logging.info('Thread [%i/%i] : It took %s seconds to upload one dataset on %s' % (worker_number, total_workers, str(tottime), str(sites)))
            if sleep_time > tottime:
                logging.info('Thread [%i/%i] : Will sleep for %s seconds' % (worker_number, total_workers, str(sleep_time - tottime)))
                sleep(sleep_time - tottime)
        else:
            logging.info('Thread [%i/%i] : Retrying a new upload' % (worker_number, total_workers))
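
In this version the dataset name is assembled directly from the chosen metadata, following the pattern project.run_number.stream_name.prod_step.datatype.version inside the tests scope. A quick illustration with made-up metadata values (in the code above run_number and version are randomised):

metadata = {'project': 'mc20', 'run_number': '12345', 'stream_name': 'automatix_stream',
            'prod_step': 'simul', 'datatype': 'AOD', 'version': '42'}
dsn = 'tests:%s.%s.%s.%s.%s.%s' % (metadata['project'], metadata['run_number'],
                                   metadata['stream_name'], metadata['prod_step'],
                                   metadata['datatype'], metadata['version'])
print(dsn)   # tests:mc20.12345.automatix_stream.simul.AOD.42
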
Ejemplo n.º 35
0
def automatix(sites,
              inputfile,
              sleep_time,
              account,
              worker_number=1,
              total_workers=1,
              scope='tests',
              once=False,
              dataset_lifetime=None,
              set_metadata=False):
    sleep(sleep_time * (total_workers - worker_number) / total_workers)

    executable = 'automatix'
    hostname = socket.getfqdn()
    pid = getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    prefix = 'automatix[%i/%i] : ' % (worker_number, total_workers)
    logger = formatted_logger(logging.log, prefix + '%s')
    while not GRACEFUL_STOP.is_set():
        heartbeat.live(executable, hostname, pid, hb_thread)
        starttime = time()
        prefix = 'automatix[%i/%i] : ' % (worker_number, total_workers)
        logger = formatted_logger(logging.log, prefix + '%s')
        logger(logging.INFO, 'Getting data distribution')
        probabilities, data = get_data_distribution(inputfile)
        logger(logging.DEBUG, 'Probabilities %s', probabilities)
        totretries = 3
        status = False

        for site in sites:

            for retry in range(0, totretries):
                start_time = time()
                tmpdir = tempfile.mkdtemp()
                logger(logging.INFO, 'Running on site %s', site)
                dic = choose_element(probabilities, data)
                metadata = dic['metadata']
                try:
                    nbfiles = dic['nbfiles']
                except KeyError:
                    nbfiles = 2
                    logger(
                        logging.WARNING,
                        'No nbfiles defined in the configuration, will use 2')
                try:
                    filesize = dic['filesize']
                except KeyError:
                    filesize = 1000000
                    logger(
                        logging.WARNING,
                        'No filesize defined in the configuration, will use 1M files'
                    )
                dsn = generate_didname(metadata, None, 'dataset')
                fnames = []
                lfns = []
                physical_fnames = []
                for _ in range(nbfiles):
                    fname = generate_didname(metadata=metadata,
                                             dsn=dsn,
                                             did_type='file')
                    lfns.append(fname)
                    logger(logging.INFO, 'Generating file %s in dataset %s',
                           fname, dsn)
                    physical_fname = '%s/%s' % (tmpdir, "".join(
                        fname.split('/')))
                    physical_fnames.append(physical_fname)
                    generate_file(physical_fname, filesize, logger=logger)
                    fnames.append(fname)
                logger(logging.INFO, 'Upload %s to %s', dsn, site)
                dsn = '%s:%s' % (scope, dsn)
                status = upload(files=lfns,
                                scope=scope,
                                metadata=metadata,
                                rse=site,
                                account=account,
                                source_dir=tmpdir,
                                dataset_lifetime=dataset_lifetime,
                                did=dsn,
                                set_metadata=set_metadata,
                                logger=logger)
                for physical_fname in physical_fnames:
                    remove(physical_fname)
                rmdir(tmpdir)
                if status:
                    monitor.record_counter(
                        counters='automatix.addnewdataset.done', delta=1)
                    monitor.record_counter(
                        counters='automatix.addnewfile.done', delta=nbfiles)
                    monitor.record_timer('automatix.datasetinjection',
                                         (time() - start_time) * 1000)
                    break
                else:
                    logger(
                        logging.INFO,
                        'Failed to upload files. Will retry another time (attempt %s/%s)',
                        str(retry + 1), str(totretries))
        if once is True:
            logger(logging.INFO, 'Run with once mode. Exiting')
            break
        tottime = time() - starttime
        if status:
            logger(logging.INFO,
                   'It took %s seconds to upload one dataset on %s',
                   str(tottime), str(sites))
            if sleep_time > tottime:
                logger(logging.INFO, 'Will sleep for %s seconds',
                       str(sleep_time - tottime))
                sleep(sleep_time - tottime)
        else:
            logger(logging.INFO, 'Retrying a new upload')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logger(logging.INFO, 'Graceful stop requested')
    logger(logging.INFO, 'Graceful stop done')
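
This newer automatix builds its logger with formatted_logger(logging.log, prefix + '%s') so every message carries the worker prefix. The wrapper below is a minimal stand-in written only to show the calling convention; it is not the actual rucio.common.logging implementation and assumes plain messages without stray '%' characters.

import logging

def formatted_logger_sketch(log_func, fmt):
    """Minimal stand-in for a formatted logger: prepend a fixed prefix to every message."""
    def wrapper(level, msg, *args, **kwargs):
        return log_func(level, fmt % msg, *args, **kwargs)
    return wrapper

logging.basicConfig(level=logging.INFO)
prefix = 'automatix[1/4] : '
logger = formatted_logger_sketch(logging.log, prefix + '%s')
logger(logging.INFO, 'Getting data distribution')
logger(logging.INFO, 'Upload %s to %s', 'tests:my.dataset', 'MOCK-RSE')
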
Ejemplo n.º 36
0
def get_destinations(rse_info, scheme, req, sources):
    dsn = 'other'
    pfn = {}
    paths = {}
    if not rse_info['deterministic']:
        ts = time.time()

        # get rule scope and name
        if req['attributes']:
            if type(req['attributes']) is dict:
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            if 'ds_name' in req_attributes:
                dsn = req_attributes["ds_name"]
        if dsn == 'other':
            # select a containing dataset
            for parent in did.list_parent_dids(req['scope'], req['name']):
                if parent['type'] == DIDType.DATASET:
                    dsn = parent['name']
                    break
        record_timer('daemons.conveyor.submitter.list_parent_dids', (time.time() - ts) * 1000)

        # always use SRM
        ts = time.time()
        nondet = rsemgr.create_protocol(rse_info, 'write', scheme='srm')
        record_timer('daemons.conveyor.submitter.create_protocol', (time.time() - ts) * 1000)

        # if there exists a prefix for SRM, use it
        prefix = ''
        for s in rse_info['protocols']:
            if s['scheme'] == 'srm':
                prefix = s['prefix']

        # DQ2 path always starts with /, but prefix might not end with /
        path = construct_surl_DQ2(dsn, req['name'])

        # retrial transfers to tape need a new filename - add timestamp
        if req['request_type'] == RequestType.TRANSFER\
           and 'previous_attempt_id' in req\
           and req['previous_attempt_id']\
           and rse_info['rse_type'] == 'TAPE':  # TODO: RUCIO-809 - rsemanager: get_rse_info -> rse_type is string instead of RSEType
            path = '%s_%i' % (path, int(time.time()))
            logging.debug('Retrial transfer request %s DID %s:%s to tape %s renamed to %s' % (req['request_id'],
                                                                                              req['scope'],
                                                                                              req['name'],
                                                                                              rse_info['rse'],
                                                                                              path))

        tmp_path = '%s%s' % (prefix[:-1], path)
        if prefix[-1] != '/':
            tmp_path = '%s%s' % (prefix, path)
        paths[req['scope'], req['name']] = path

        # add the hostname
        pfn['%s:%s' % (req['scope'], req['name'])] = nondet.path2pfn(tmp_path)
        if req['request_type'] == RequestType.STAGEIN:
            if len(sources) == 1:
                pfn['%s:%s' % (req['scope'], req['name'])] = sources[0][1]
            else:
                # TODO: need to check
                return None, None

        # we must set the destination path for nondeterministic replicas explicitly
        replica.update_replicas_paths([{'scope': req['scope'],
                                        'name': req['name'],
                                        'rse_id': req['dest_rse_id'],
                                        'path': path}])

    else:
        ts = time.time()
        try:
            pfn = rsemgr.lfns2pfns(rse_info,
                                   lfns=[{'scope': req['scope'],
                                          'name': req['name']}],
                                   scheme=scheme)
        except RSEProtocolNotSupported:
            logging.warn('%s not supported by %s' % (scheme, rse_info['rse']))
            return None, None

        record_timer('daemons.conveyor.submitter.lfns2pfns', (time.time() - ts) * 1000)

    destinations = []
    for k in pfn:
        if isinstance(pfn[k], (str, unicode)):
            destinations.append(pfn[k])
        elif isinstance(pfn[k], (tuple, list)):
            for url in pfn[k]:
                destinations.append(pfn[k][url])

    protocols = None
    try:
        protocols = rsemgr.select_protocol(rse_info, 'write', scheme=scheme)
    except RSEProtocolNotSupported:
        logging.warn('%s not supported by %s' % (scheme, rse_info['rse']))
        return None, None

    # we need to set the spacetoken if we use SRM
    dest_spacetoken = None
    if scheme == 'srm':
        dest_spacetoken = protocols['extended_attributes']['space_token']

    return destinations, dest_spacetoken
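
For non-deterministic RSEs the destination is built by joining the SRM prefix with a path that always starts with '/', taking care not to produce a double slash (equivalent to the tmp_path logic above). A tiny sketch of that join with example values:

def join_prefix_and_path(prefix, path):
    """
    Join an SRM prefix with a path that always starts with '/', avoiding a double slash.
    """
    if prefix.endswith('/'):
        return prefix[:-1] + path
    return prefix + path

assert join_prefix_and_path('/dpm/example.org/home/atlas/', '/rucio/tests/ab/cd/file') \
    == '/dpm/example.org/home/atlas/rucio/tests/ab/cd/file'
assert join_prefix_and_path('/dpm/example.org/home/atlas', '/rucio/tests/ab/cd/file') \
    == '/dpm/example.org/home/atlas/rucio/tests/ab/cd/file'
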
Ejemplo n.º 37
0
def submitter(once=False,
              rses=None,
              mock=False,
              bulk=100,
              group_bulk=1,
              group_policy='rule',
              source_strategy=None,
              activities=None,
              sleep_time=600,
              max_sources=4,
              retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s",
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = sys.argv[0]
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info('%s Submitter starting with timeout %s', prepend_str, timeout)

    time.sleep(
        10
    )  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info('%s Transfer submitter started', prepend_str)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info('%s CMS user transfer activity', prepend_str)
                    user_transfer = True

                logging.info('%s Starting to get transfer transfers for %s',
                             prepend_str, activity)
                start_time = time.time()
                transfers = __get_transfers(
                    total_workers=heart_beat['nr_threads'] - 1,
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    schemes=scheme,
                    mock=mock,
                    max_sources=max_sources,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts)
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter(
                    'daemons.conveyor.transfer_submitter.get_transfers',
                    len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.transfers',
                    len(transfers))
                logging.info('%s Got %s transfers for %s in %s seconds',
                             prepend_str, len(transfers), activity,
                             time.time() - start_time)

                # group transfers
                logging.info('%s Starting to group transfers for %s',
                             prepend_str, activity)
                start_time = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk, source_strategy,
                                                   max_time_in_queue)
                record_timer(
                    'daemons.conveyor.transfer_submitter.bulk_group_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))

                logging.info('%s Starting to submit transfers for %s',
                             prepend_str, activity)

                if TRANSFER_TOOL == 'fts3':
                    for external_host in grouped_jobs:
                        if not user_transfer:
                            for job in grouped_jobs[external_host]:
                                # submit transfers
                                submit_transfer(
                                    external_host=external_host,
                                    job=job,
                                    submitter='transfer_submitter',
                                    logging_prepend_str=prepend_str,
                                    timeout=timeout)
                        else:
                            for _, jobs in iteritems(
                                    grouped_jobs[external_host]):
                                # submit transfers
                                for job in jobs:
                                    submit_transfer(
                                        external_host=external_host,
                                        job=job,
                                        submitter='transfer_submitter',
                                        logging_prepend_str=prepend_str,
                                        timeout=timeout,
                                        user_transfer_job=user_transfer)
                elif TRANSFER_TOOL == 'globus':
                    if TRANSFER_TYPE == 'bulk':
                        # build bulk job file list per external host to send to submit_transfer
                        for external_host in grouped_jobs:
                            # pad the job with job_params; irrelevant for globus but needed for further rucio parsing
                            submitjob = {
                                'files': [],
                                'job_params':
                                grouped_jobs[''][0].get('job_params')
                            }
                            for job in grouped_jobs[external_host]:
                                submitjob.get('files').append(
                                    job.get('files')[0])
                            logging.debug('submitjob: %s' % submitjob)
                            submit_transfer(external_host=external_host,
                                            job=submitjob,
                                            submitter='transfer_submitter',
                                            logging_prepend_str=prepend_str,
                                            timeout=timeout)
                    else:
                        # build single job files and individually send to submit_transfer
                        job_params = grouped_jobs[''][0].get(
                            'job_params') if grouped_jobs else None
                        for external_host in grouped_jobs:
                            for job in grouped_jobs[external_host]:
                                for file in job['files']:
                                    singlejob = {
                                        'files': [file],
                                        'job_params': job_params
                                    }
                                    logging.debug('singlejob: %s' % singlejob)
                                    submit_transfer(
                                        external_host=external_host,
                                        job=singlejob,
                                        submitter='transfer_submitter',
                                        logging_prepend_str=prepend_str,
                                        timeout=timeout)
                else:
                    logging.error(prepend_str + 'Unknown transfer tool')

                if len(transfers) < group_bulk:
                    logging.info(
                        '%s Only %s transfers for %s which is less than group bulk %s, sleep %s seconds',
                        prepend_str, len(transfers), activity, group_bulk,
                        sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time(
                        ) + sleep_time
        except Exception:
            logging.critical('%s %s', prepend_str, str(traceback.format_exc()))

        if once:
            break

    logging.info('%s Graceful stop requested', prepend_str)

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s Graceful stop done', prepend_str)
    return
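
For reference, a self-contained sketch of the bulk-versus-single packaging that the Globus branch above performs; package_globus_jobs and the sample data are illustrative stand-ins, not part of the daemon:

# Illustrative stand-in for the Globus packaging logic above (not the real daemon code).
def package_globus_jobs(grouped_jobs, transfer_type='bulk'):
    packaged = []
    for external_host, jobs in grouped_jobs.items():
        if transfer_type == 'bulk':
            # one padded job per external host, carrying the first file of every grouped job
            submitjob = {'files': [], 'job_params': jobs[0].get('job_params')}
            for job in jobs:
                submitjob['files'].append(job.get('files')[0])
            packaged.append((external_host, submitjob))
        else:
            # one job per individual file
            for job in jobs:
                for transfer_file in job['files']:
                    packaged.append((external_host,
                                     {'files': [transfer_file],
                                      'job_params': job.get('job_params')}))
    return packaged

# Hypothetical grouped_jobs structure: external host -> list of jobs with 'files' and 'job_params'.
grouped_jobs = {'globus-endpoint-a': [{'files': [{'name': 'file1'}], 'job_params': {'priority': 3}},
                                      {'files': [{'name': 'file2'}], 'job_params': {'priority': 3}}]}
print(package_globus_jobs(grouped_jobs, transfer_type='bulk'))
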
Example No. 38
0
def delete_from_storage(replicas,
                        prot,
                        rse_info,
                        staging_areas,
                        auto_exclude_threshold,
                        logger=logging.log):
    deleted_files = []
    rse_name = rse_info['rse']
    rse_id = rse_info['id']
    noaccess_attempts = 0
    pfns_to_bulk_delete = []
    try:
        prot.connect()
        for replica in replicas:
            # Physical deletion
            try:
                deletion_dict = {
                    'scope': replica['scope'].external,
                    'name': replica['name'],
                    'rse': rse_name,
                    'file-size': replica['bytes'],
                    'bytes': replica['bytes'],
                    'url': replica['pfn'],
                    'protocol': prot.attributes['scheme']
                }
                if replica['scope'].vo != 'def':
                    deletion_dict['vo'] = replica['scope'].vo
                logger(logging.DEBUG, 'Deletion ATTEMPT of %s:%s as %s on %s',
                       replica['scope'], replica['name'], replica['pfn'],
                       rse_name)
                start = time.time()
                # For STAGING RSEs, no physical deletion
                if rse_id in staging_areas:
                    logger(
                        logging.WARNING,
                        'Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                        replica['scope'], replica['name'], replica['pfn'],
                        rse_name)
                    deleted_files.append({
                        'scope': replica['scope'],
                        'name': replica['name']
                    })
                    continue

                if replica['pfn']:
                    pfn = replica['pfn']
                    # sign the URL if necessary
                    if prot.attributes['scheme'] == 'https' and rse_info[
                            'sign_url'] is not None:
                        pfn = get_signed_url(rse_id, rse_info['sign_url'],
                                             'delete', pfn)
                    if prot.attributes['scheme'] == 'globus':
                        pfns_to_bulk_delete.append(replica['pfn'])
                    else:
                        prot.delete(pfn)
                else:
                    logger(logging.WARNING,
                           'Deletion UNAVAILABLE of %s:%s as %s on %s',
                           replica['scope'], replica['name'], replica['pfn'],
                           rse_name)

                monitor.record_timer('daemons.reaper.delete.{scheme}.{rse}',
                                     (time.time() - start) * 1000,
                                     labels={
                                         'scheme': prot.attributes['scheme'],
                                         'rse': rse_name
                                     })
                duration = time.time() - start

                deleted_files.append({
                    'scope': replica['scope'],
                    'name': replica['name']
                })

                deletion_dict['duration'] = duration
                add_message('deletion-done', deletion_dict)
                logger(
                    logging.INFO,
                    'Deletion SUCCESS of %s:%s as %s on %s in %.2f seconds',
                    replica['scope'], replica['name'], replica['pfn'],
                    rse_name, duration)

            except SourceNotFound:
                duration = time.time() - start
                err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s in %.2f seconds' % (
                    replica['scope'], replica['name'], replica['pfn'],
                    rse_name, duration)
                logger(logging.WARNING, '%s', err_msg)
                deletion_dict['reason'] = 'File Not Found'
                deletion_dict['duration'] = duration
                add_message('deletion-not-found', deletion_dict)
                deleted_files.append({
                    'scope': replica['scope'],
                    'name': replica['name']
                })

            except (ServiceUnavailable, RSEAccessDenied,
                    ResourceTemporaryUnavailable) as error:
                duration = time.time() - start
                logger(logging.WARNING,
                       'Deletion NOACCESS of %s:%s as %s on %s: %s in %.2f seconds',
                       replica['scope'], replica['name'], replica['pfn'],
                       rse_name, str(error), duration)
                deletion_dict['reason'] = str(error)
                deletion_dict['duration'] = duration
                add_message('deletion-failed', deletion_dict)
                noaccess_attempts += 1
                if noaccess_attempts >= auto_exclude_threshold:
                    logger(
                        logging.INFO,
                        'Too many (%d) NOACCESS attempts for %s. RSE will be temporarily excluded.',
                        noaccess_attempts, rse_name)
                    REGION.set('temporary_exclude_%s' % rse_id, True)
                    labels = {'rse': rse_name}
                    EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
                    break

            except Exception as error:
                duration = time.time() - start
                logger(
                    logging.CRITICAL,
                    'Deletion CRITICAL of %s:%s as %s on %s in %.2f seconds : %s',
                    replica['scope'], replica['name'], replica['pfn'],
                    rse_name, duration, str(traceback.format_exc()))
                deletion_dict['reason'] = str(error)
                deletion_dict['duration'] = duration
                add_message('deletion-failed', deletion_dict)

        if pfns_to_bulk_delete and prot.attributes['scheme'] == 'globus':
            logger(logging.DEBUG,
                   'Attempting bulk delete on RSE %s for scheme %s', rse_name,
                   prot.attributes['scheme'])
            prot.bulk_delete(pfns_to_bulk_delete)

    except (ServiceUnavailable, RSEAccessDenied,
            ResourceTemporaryUnavailable) as error:
        for replica in replicas:
            logger(logging.WARNING,
                   'Deletion NOACCESS of %s:%s as %s on %s: %s',
                   replica['scope'], replica['name'], replica['pfn'], rse_name,
                   str(error))
            payload = {
                'scope': replica['scope'].external,
                'name': replica['name'],
                'rse': rse_name,
                'file-size': replica['bytes'],
                'bytes': replica['bytes'],
                'url': replica['pfn'],
                'reason': str(error),
                'protocol': prot.attributes['scheme']
            }
            if replica['scope'].vo != 'def':
                payload['vo'] = replica['scope'].vo
            add_message('deletion-failed', payload)
        logger(logging.INFO,
               'Cannot connect to %s. RSE will be temporarily excluded.',
               rse_name)
        REGION.set('temporary_exclude_%s' % rse_id, True)
        labels = {'rse': rse_name}
        EXCLUDED_RSE_GAUGE.labels(**labels).set(1)
    finally:
        prot.close()
    return deleted_files
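
delete_from_storage above reads replica['scope'].external and replica['scope'].vo, so the scope is an object rather than a plain string. A minimal sketch of the input shapes it expects, with a namedtuple standing in for the scope object and the actual call left commented out because the function relies on the reaper's module-level state (REGION, the excluded-RSE gauge, the message queue):

from collections import namedtuple

# Minimal stand-in for the scope object: the function only reads .external and .vo.
Scope = namedtuple('Scope', ['external', 'vo'])

replicas = [{'scope': Scope('user.jdoe', 'def'),
             'name': 'file.root',
             'bytes': 1048576,
             'pfn': 'https://storage.example.org:443/path/user.jdoe/file.root'}]

rse_info = {'rse': 'SITE_SCRATCHDISK',                # hypothetical RSE name
            'id': 'abcdef0123456789abcdef0123456789',
            'sign_url': None}                         # e.g. 'gcs' when signed URLs are required

# deleted = delete_from_storage(replicas, prot, rse_info, staging_areas=[], auto_exclude_threshold=5)
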
Example No. 39
0
def reaper(rses, worker_number=0, child_number=0, total_children=1, chunk_size=100,
           once=False, greedy=False, scheme=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param delay_seconds: The delay to query replicas in BEING_DELETED state.
    """
    logging.info('Starting Reaper: Worker %(worker_number)s, '
                 'child %(child_number)s will work on RSEs: ' % locals() + ', '.join([rse['rse'] for rse in rses]))

    pid = os.getpid()
    thread = threading.current_thread()
    hostname = socket.gethostname()
    executable = ' '.join(sys.argv)
    # Generate a hash just for the subset of RSEs
    rse_names = [rse['rse'] for rse in rses]
    hash_executable = hashlib.sha256((sys.argv[0] + ''.join(rse_names)).encode()).hexdigest()
    sanity_check(executable=None, hostname=hostname)

    nothing_to_do = {}
    while not GRACEFUL_STOP.is_set():
        try:
            # heartbeat
            heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
            checkpoint_time = datetime.datetime.now()
            # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))

            max_deleting_rate = 0
            for rse in sort_rses(rses):
                try:
                    if checkpoint_time + datetime.timedelta(minutes=1) < datetime.datetime.now():
                        heartbeat = live(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
                        # logging.info('Reaper({0[worker_number]}/{0[child_number]}): Live gives {0[heartbeat]}'.format(locals()))
                        checkpoint_time = datetime.datetime.now()

                    if rse['id'] in nothing_to_do and nothing_to_do[rse['id']] > datetime.datetime.now():
                        continue
                    logging.info('Reaper %s-%s: Running on RSE %s %s', worker_number, child_number,
                                 rse['rse'], nothing_to_do.get(rse['id']))

                    rse_info = rsemgr.get_rse_info(rse_id=rse['id'])
                    rse_protocol = rse_core.get_rse_protocols(rse_id=rse['id'])

                    if not rse_protocol['availability_delete']:
                        logging.info('Reaper %s-%s: RSE %s is not available for deletion', worker_number, child_number, rse_info['rse'])
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        continue

                    # Temporary hack to force gfal for deletion
                    for protocol in rse_info['protocols']:
                        if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                            protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                    needed_free_space, max_being_deleted_files = None, 100
                    needed_free_space_per_child = None
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space, needed_free_space_per_child = 0, 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s', worker_number, child_number, rse['rse'])
                        else:
                            if total_children and total_children > 0:
                                needed_free_space_per_child = needed_free_space / float(total_children)

                    start = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse_id=rse['id'],
                                                          bytes=needed_free_space_per_child,
                                                          limit=max_being_deleted_files,
                                                          worker_number=child_number,
                                                          total_workers=total_children,
                                                          delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas on %s for %s bytes in %s seconds: %s replicas', worker_number, child_number, rse['rse'], needed_free_space_per_child, time.time() - start, len(replicas))

                    if not replicas:
                        nothing_to_do[rse['id']] = datetime.datetime.now() + datetime.timedelta(minutes=30)
                        logging.info('Reaper %s-%s: No replicas to delete %s. The next check will occur at %s',
                                     worker_number, child_number, rse['rse'],
                                     nothing_to_do[rse['id']])
                        continue

                    prot = rsemgr.create_protocol(rse_info, 'delete', scheme=scheme)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s', worker_number, child_number, str(files))
                        try:
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files], nowait=True)
                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info,
                                                                               lfns=[{'scope': replica['scope'].external, 'name': replica['name'], 'path': replica['path']}],
                                                                               operation='delete', scheme=scheme).values())[0])
                                except (ReplicaUnAvailable, ReplicaNotFound) as error:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(error))
                                    logging.warning('Reaper %s-%s: %s', worker_number, child_number, err_msg)
                                    replica['pfn'] = None

                            monitor.record_counter(counters='reaper.deletion.being_deleted', delta=len(files))

                            try:
                                deleted_files = []
                                prot.connect()
                                for replica in files:
                                    try:
                                        deletion_dict = {'scope': replica['scope'].external,
                                                         'name': replica['name'],
                                                         'rse': rse_info['rse'],
                                                         'rse_id': rse_info['id'],
                                                         'file-size': replica['bytes'],
                                                         'bytes': replica['bytes'],
                                                         'url': replica['pfn'],
                                                         'protocol': prot.attributes['scheme']}
                                        if replica['scope'].vo != 'def':
                                            deletion_dict['vo'] = replica['scope'].vo
                                        logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        start = time.time()
                                        if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                            logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion',
                                                            worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        else:
                                            if replica['pfn']:
                                                pfn = replica['pfn']
                                                # sign the URL if necessary
                                                if prot.attributes['scheme'] == 'https' and rse_info['sign_url'] is not None:
                                                    pfn = get_signed_url(rse['id'], rse_info['sign_url'], 'delete', pfn)
                                                prot.delete(pfn)
                                            else:
                                                logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        monitor.record_timer('daemons.reaper.delete.%s.%s' % (prot.attributes['scheme'], rse['rse']), (time.time() - start) * 1000)
                                        duration = time.time() - start

                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                        deletion_dict['duration'] = duration
                                        add_message('deletion-done', deletion_dict)
                                        logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s in %s seconds', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], duration)
                                    except SourceNotFound:
                                        err_msg = 'Deletion NOTFOUND of %s:%s as %s on %s' % (replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                        logging.warning(err_msg)
                                        deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                        if replica['state'] == ReplicaState.AVAILABLE:
                                            deletion_dict['reason'] = str(err_msg)
                                            add_message('deletion-failed', deletion_dict)
                                    except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except Exception as error:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                                        deletion_dict['reason'] = str(error)
                                        add_message('deletion-failed', deletion_dict)
                                    except:
                                        logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc()))
                            except (ServiceUnavailable, RSEAccessDenied, ResourceTemporaryUnavailable) as error:
                                for replica in files:
                                    logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s', worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(error))
                                    payload = {'scope': replica['scope'].external,
                                               'name': replica['name'],
                                               'rse': rse_info['rse'],
                                               'rse_id': rse_info['id'],
                                               'file-size': replica['bytes'],
                                               'bytes': replica['bytes'],
                                               'url': replica['pfn'],
                                               'reason': str(error),
                                               'protocol': prot.attributes['scheme']}
                                    if replica['scope'].vo != 'def':
                                        payload['vo'] = replica['scope'].vo
                                    add_message('deletion-failed', payload)
                                    break
                            finally:
                                prot.close()
                            start = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse_id=rse['id'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s', worker_number, child_number, rse['rse'], len(deleted_files), time.time() - start)
                            monitor.record_counter(counters='reaper.deletion.done', delta=len(deleted_files))

                        except DatabaseException as error:
                            logging.warning('Reaper %s-%s: DatabaseException %s', worker_number, child_number, str(error))
                        except UnsupportedOperation as error:
                            logging.warning('Reaper %s-%s: UnsupportedOperation %s', worker_number, child_number, str(error))
                        except:
                            logging.critical(traceback.format_exc())

                except RSENotFound as error:
                    logging.warning('Reaper %s-%s: RSE not found %s', worker_number, child_number, str(error))

                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            time.sleep(1)

        except DatabaseException as error:
            logging.warning('Reaper:  %s', str(error))
        except:
            logging.critical(traceback.format_exc())

    die(executable=executable, hostname=hostname, pid=pid, thread=thread, hash_executable=hash_executable)
    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
    return
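
The nothing_to_do dictionary above implements a simple per-RSE back-off: when an RSE yields no replicas (or is not available for deletion), the reaper records a retry timestamp and skips that RSE until it expires. A standalone sketch of that pattern, with names mirroring the function but nothing imported from Rucio:

import datetime

nothing_to_do = {}

def record_no_work(rse_id, minutes=30):
    """Remember that this RSE had nothing to delete; retry only after the given delay."""
    nothing_to_do[rse_id] = datetime.datetime.now() + datetime.timedelta(minutes=minutes)

def should_skip(rse_id):
    """True while the RSE is still inside its back-off window."""
    return rse_id in nothing_to_do and nothing_to_do[rse_id] > datetime.datetime.now()

record_no_work('rse-1234')
print(should_skip('rse-1234'))   # True for the next 30 minutes
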
Example No. 40
0
def __handle_requests(reqs,
                      suspicious_patterns,
                      retry_protocol_mismatches,
                      logger=logging.log):
    """
    Used by finisher to handle terminated requests.

    :param reqs:                         List of requests.
    :param suspicious_patterns:          List of suspicious patterns.
    :param retry_protocol_mismatches:    Boolean to retry the transfer in case of protocol mismatch.
    :param logger:                       Optional decorated logger that can be passed from the calling daemon or server.
    """

    failed_during_submission = [
        RequestState.SUBMITTING, RequestState.SUBMISSION_FAILED,
        RequestState.LOST
    ]
    failed_no_submission_attempts = [
        RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES,
        RequestState.MISMATCH_SCHEME
    ]
    undeterministic_rses = __get_undeterministic_rses(logger=logger)
    rses_info, protocols = {}, {}
    replicas = {}
    for req in reqs:
        try:
            replica = {
                'scope': req['scope'],
                'name': req['name'],
                'rse_id': req['dest_rse_id'],
                'bytes': req['bytes'],
                'adler32': req['adler32'],
                'request_id': req['request_id']
            }

            replica['pfn'] = req['dest_url']
            replica['request_type'] = req['request_type']
            replica['error_message'] = None

            if req['request_type'] not in replicas:
                replicas[req['request_type']] = {}
            if req['rule_id'] not in replicas[req['request_type']]:
                replicas[req['request_type']][req['rule_id']] = []

            if req['state'] == RequestState.DONE:
                replica['state'] = ReplicaState.AVAILABLE
                replica['archived'] = False

                # for TAPE, replica path is needed
                if req['request_type'] in (
                        RequestType.TRANSFER, RequestType.STAGEIN
                ) and req['dest_rse_id'] in undeterministic_rses:
                    if req['dest_rse_id'] not in rses_info:
                        rses_info[
                            req['dest_rse_id']] = rsemanager.get_rse_info(
                                rse_id=req['dest_rse_id'])
                    pfn = req['dest_url']
                    scheme = urlparse(pfn).scheme
                    dest_rse_id_scheme = '%s_%s' % (req['dest_rse_id'], scheme)
                    if dest_rse_id_scheme not in protocols:
                        protocols[
                            dest_rse_id_scheme] = rsemanager.create_protocol(
                                rses_info[req['dest_rse_id']], 'write', scheme)
                    path = protocols[dest_rse_id_scheme].parse_pfns(
                        [pfn])[pfn]['path']
                    replica['path'] = os.path.join(path, os.path.basename(pfn))

                # replica should not be added to replicas until all info are filled
                replicas[req['request_type']][req['rule_id']].append(replica)

            # Standard failure from the transfer tool
            elif req['state'] == RequestState.FAILED:
                __check_suspicious_files(req,
                                         suspicious_patterns,
                                         logger=logger)
                tss = time.time()
                try:
                    if request_core.should_retry_request(
                            req, retry_protocol_mismatches):
                        new_req = request_core.requeue_and_archive(
                            req,
                            source_ranking_update=True,
                            retry_protocol_mismatches=retry_protocol_mismatches,
                            logger=logger)
                        # should_retry_request and requeue_and_archive are not in one session,
                        # another process can requeue_and_archive and this one will return None.
                        record_timer(
                            'daemons.conveyor.common.update_request_state.request_requeue_and_archive',
                            (time.time() - tss) * 1000)
                        logger(
                            logging.WARNING,
                            'REQUEUED DID %s:%s REQUEST %s AS %s TRY %s' %
                            (req['scope'], req['name'], req['request_id'],
                             new_req['request_id'], new_req['retry_count']))
                    else:
                        # No new_req is returned if should_retry_request returns False
                        logger(
                            logging.WARNING,
                            'EXCEEDED SUBMITTING DID %s:%s REQUEST %s in state %s',
                            req['scope'], req['name'], req['request_id'],
                            req['state'])
                        replica['state'] = ReplicaState.UNAVAILABLE
                        replica['archived'] = False
                        replica['error_message'] = req['err_msg'] if req[
                            'err_msg'] else request_core.get_transfer_error(
                                req['state'])
                        replicas[req['request_type']][req['rule_id']].append(
                            replica)
                except RequestNotFound:
                    logger(logging.WARNING, 'Cannot find request %s anymore',
                           req['request_id'])

            # All other failures
            elif req['state'] in failed_during_submission or req[
                    'state'] in failed_no_submission_attempts:
                if req['state'] in failed_during_submission and req[
                        'updated_at'] > (datetime.datetime.utcnow() -
                                         datetime.timedelta(minutes=120)):
                    # To prevent race conditions
                    continue
                try:
                    tss = time.time()
                    if request_core.should_retry_request(
                            req, retry_protocol_mismatches):
                        new_req = request_core.requeue_and_archive(
                            req,
                            source_ranking_update=False,
                            retry_protocol_mismatches=retry_protocol_mismatches,
                            logger=logger)
                        record_timer(
                            'daemons.conveyor.common.update_request_state.request_requeue_and_archive',
                            (time.time() - tss) * 1000)
                        logger(
                            logging.WARNING,
                            'REQUEUED SUBMITTING DID %s:%s REQUEST %s AS %s TRY %s'
                            % (req['scope'], req['name'], req['request_id'],
                               new_req['request_id'], new_req['retry_count']))
                    else:
                        # No new_req is returned if should_retry_request returns False
                        logger(
                            logging.WARNING,
                            'EXCEEDED SUBMITTING DID %s:%s REQUEST %s in state %s',
                            req['scope'], req['name'], req['request_id'],
                            req['state'])
                        replica['state'] = ReplicaState.UNAVAILABLE
                        replica['archived'] = False
                        replica['error_message'] = req['err_msg'] if req[
                            'err_msg'] else request_core.get_transfer_error(
                                req['state'])
                        replicas[req['request_type']][req['rule_id']].append(
                            replica)
                except RequestNotFound:
                    logger(logging.WARNING, 'Cannot find request %s anymore',
                           req['request_id'])

        except Exception as error:
            logger(
                logging.ERROR,
                "Something unexpected happened when handling request %s(%s:%s) at %s: %s"
                % (req['request_id'], req['scope'], req['name'],
                   req['dest_rse_id'], str(error)))

    __handle_terminated_replicas(replicas, logger=logger)
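
Terminated replicas are bucketed first by request type and then by rule id before being handed to __handle_terminated_replicas. A small standalone sketch of that two-level grouping, using plain strings in place of the Rucio enum values:

from collections import defaultdict

# request type -> rule id -> list of replica dictionaries
replicas = defaultdict(lambda: defaultdict(list))

terminated = [{'request_type': 'TRANSFER', 'rule_id': 'rule-1', 'name': 'f1', 'state': 'AVAILABLE'},
              {'request_type': 'TRANSFER', 'rule_id': 'rule-1', 'name': 'f2', 'state': 'UNAVAILABLE'},
              {'request_type': 'STAGEIN', 'rule_id': 'rule-2', 'name': 'f3', 'state': 'AVAILABLE'}]

for rep in terminated:
    replicas[rep['request_type']][rep['rule_id']].append(rep)

print({req_type: dict(by_rule) for req_type, by_rule in replicas.items()})
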
Example No. 41
0
    def submit(self, files, job_params, timeout=None):
        """
        Submit transfers to FTS3 via JSON.

        :param files:        List of dictionaries describing the file transfers.
        :param job_params:   Dictionary containing key/value pairs, for all transfers.
        :param timeout:      Timeout in seconds.
        :returns:            FTS transfer identifier.
        """

        # FTS3 expects 'davs' as the scheme identifier instead of https
        for transfer_file in files:
            if not transfer_file['sources']:
                raise Exception('No sources defined')

            new_src_urls = []
            new_dst_urls = []
            for url in transfer_file['sources']:
                if url.startswith('https'):
                    new_src_urls.append(':'.join(['davs'] + url.split(':')[1:]))
                else:
                    new_src_urls.append(url)
            for url in transfer_file['destinations']:
                if url.startswith('https'):
                    new_dst_urls.append(':'.join(['davs'] + url.split(':')[1:]))
                else:
                    new_dst_urls.append(url)

            transfer_file['sources'] = new_src_urls
            transfer_file['destinations'] = new_dst_urls

        transfer_id = None
        expected_transfer_id = None
        if self.deterministic_id:
            job_params = job_params.copy()
            job_params["id_generator"] = "deterministic"
            job_params["sid"] = files[0]['metadata']['request_id']
            expected_transfer_id = self.__get_deterministic_id(job_params["sid"])
            logging.debug("Submit bulk transfers in deterministic mode, sid %s, expected transfer id: %s", job_params["sid"], expected_transfer_id)

        # bulk submission
        params_dict = {'files': files, 'params': job_params}
        params_str = json.dumps(params_dict, cls=APIEncoder)

        post_result = None
        try:
            start_time = time.time()
            post_result = requests.post('%s/jobs' % self.external_host,
                                        verify=self.verify,
                                        cert=self.cert,
                                        data=params_str,
                                        headers={'Content-Type': 'application/json'},
                                        timeout=timeout)
            record_timer('transfertool.fts3.submit_transfer.%s' % self.__extract_host(self.external_host), (time.time() - start_time) * 1000 / len(files))
        except ReadTimeout as error:
            raise TransferToolTimeout(error)
        except JSONDecodeError as error:
            raise TransferToolWrongAnswer(error)
        except Exception as error:
            logging.warning('Could not submit transfer to %s - %s' % (self.external_host, str(error)))

        if post_result and post_result.status_code == 200:
            record_counter('transfertool.fts3.%s.submission.success' % self.__extract_host(self.external_host), len(files))
            transfer_id = str(post_result.json()['job_id'])
        elif post_result and post_result.status_code == 409:
            record_counter('transfertool.fts3.%s.submission.failure' % self.__extract_host(self.external_host), len(files))
            raise DuplicateFileTransferSubmission()
        else:
            if expected_transfer_id:
                transfer_id = expected_transfer_id
                logging.warn("Failed to submit transfer to %s, will use expected transfer id %s, error: %s", self.external_host, transfer_id, post_result.text if post_result is not None else post_result)
            else:
                logging.warn("Failed to submit transfer to %s, error: %s", self.external_host, post_result.text if post_result is not None else post_result)
            record_counter('transfertool.fts3.%s.submission.failure' % self.__extract_host(self.external_host), len(files))

        if not transfer_id:
            raise TransferToolWrongAnswer('No transfer id returned by %s' % self.external_host)
        return transfer_id
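
The https-to-davs rewrite at the top of submit() can be exercised in isolation; the helper below reproduces exactly that string manipulation on plain URL lists (the function name is illustrative, not part of the transfertool):

def to_davs(urls):
    """Rewrite https URLs to the 'davs' scheme expected by FTS3; leave other schemes untouched."""
    rewritten = []
    for url in urls:
        if url.startswith('https'):
            rewritten.append(':'.join(['davs'] + url.split(':')[1:]))
        else:
            rewritten.append(url)
    return rewritten

print(to_davs(['https://webdav.example.org:443/path/file', 'root://xrootd.example.org/path/file']))
# ['davs://webdav.example.org:443/path/file', 'root://xrootd.example.org/path/file']
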
Example No. 42
0
def reaper(rses, worker_number=1, child_number=1, total_children=1, chunk_size=100, once=False, greedy=False, scheme=None, exclude_rses=None, delay_seconds=0):
    """
    Main loop to select and delete files.

    :param rses: List of RSEs the reaper should work against. If empty, it considers all RSEs.
    :param worker_number: The worker number.
    :param child_number: The child number.
    :param total_children: The total number of children created per worker.
    :param chunk_size: the size of chunk for deletion.
    :param once: If True, only runs one iteration of the main loop.
    :param greedy: If True, delete right away replicas with tombstone.
    :param scheme: Force the reaper to use a particular protocol, e.g., mock.
    :param exclude_rses: RSE expression to exclude RSEs from the Reaper.
    :param delay_seconds: The delay to query replicas in BEING_DELETED state.
    """
    logging.info('Starting reaper: worker %(worker_number)s, child %(child_number)s' % locals())
    while not graceful_stop.is_set():
        try:
            max_deleting_rate = 0

            for rse in rses:
                deleting_rate = 0
                rse_info = rsemgr.get_rse_info(rse['rse'])
                rse_protocol = rse_core.get_rse_protocols(rse['rse'])

                if not rse_protocol['availability_delete']:
                    logging.info('Reaper %s-%s: RSE %s is not available for deletion' % (worker_number, child_number, rse_info['rse']))
                    continue

                # Temporary hack to force gfal for deletion
                for protocol in rse_info['protocols']:
                    if protocol['impl'] == 'rucio.rse.protocols.srm.Default' or protocol['impl'] == 'rucio.rse.protocols.gsiftp.Default':
                        protocol['impl'] = 'rucio.rse.protocols.gfal.Default'

                logging.info('Reaper %s-%s: Running on RSE %s' % (worker_number, child_number, rse_info['rse']))
                try:
                    needed_free_space, max_being_deleted_files = None, 10000
                    if not greedy:
                        max_being_deleted_files, needed_free_space, used, free = __check_rse_usage(rse=rse['rse'], rse_id=rse['id'])
                        logging.info('Reaper %(worker_number)s-%(child_number)s: Space usage for RSE %(rse)s - max_being_deleted_files: %(max_being_deleted_files)s, needed_free_space: %(needed_free_space)s, used: %(used)s, free: %(free)s' % locals())
                        if needed_free_space <= 0:
                            needed_free_space = 0
                            logging.info('Reaper %s-%s: free space is above minimum limit for %s' % (worker_number, child_number, rse['rse']))

                    s = time.time()
                    with monitor.record_timer_block('reaper.list_unlocked_replicas'):
                        replicas = list_unlocked_replicas(rse=rse['rse'], bytes=needed_free_space, limit=max_being_deleted_files, worker_number=child_number, total_workers=total_children, delay_seconds=delay_seconds)
                    logging.debug('Reaper %s-%s: list_unlocked_replicas %s %s %s' % (worker_number, child_number, rse['rse'], time.time() - s, len(replicas)))

                    if not replicas:
                        logging.info('Reaper %s-%s: nothing to do for %s' % (worker_number, child_number, rse['rse']))
                        continue

                    p = rsemgr.create_protocol(rse_info, 'delete', scheme=None)
                    for files in chunks(replicas, chunk_size):
                        logging.debug('Reaper %s-%s: Running on : %s' % (worker_number, child_number, str(files)))
                        try:
                            s = time.time()
                            update_replicas_states(replicas=[dict(list(replica.items()) + [('state', ReplicaState.BEING_DELETED), ('rse_id', rse['id'])]) for replica in files])

                            for replica in files:
                                try:
                                    replica['pfn'] = str(list(rsemgr.lfns2pfns(rse_settings=rse_info, lfns=[{'scope': replica['scope'], 'name': replica['name']}, ], operation='delete').values())[0])
                                except ReplicaUnAvailable as e:
                                    err_msg = 'Failed to get pfn UNAVAILABLE replica %s:%s on %s with error %s' % (replica['scope'], replica['name'], rse['rse'], str(e))
                                    logging.warning('Reaper %s-%s: %s' % (worker_number, child_number, err_msg))
                                    replica['pfn'] = None

                                add_message('deletion-planned', {'scope': replica['scope'],
                                                                 'name': replica['name'],
                                                                 'file-size': replica['bytes'],
                                                                 'url': replica['pfn'],
                                                                 'rse': rse_info['rse']})

                            # logging.debug('update_replicas_states %s' % (time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.being_deleted',  delta=len(files))

                            if not scheme:
                                try:
                                    deleted_files = []
                                    p.connect()
                                    for replica in files:
                                        try:
                                            logging.info('Reaper %s-%s: Deletion ATTEMPT of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            s = time.time()
                                            if rse['staging_area'] or rse['rse'].endswith("STAGING"):
                                                logging.warning('Reaper %s-%s: Deletion STAGING of %s:%s as %s on %s, will only delete the catalog and not do physical deletion' % (worker_number,
                                                                                                                                                                                    child_number,
                                                                                                                                                                                    replica['scope'],
                                                                                                                                                                                    replica['name'],
                                                                                                                                                                                    replica['pfn'],
                                                                                                                                                                                    rse['rse']))
                                            else:
                                                if replica['pfn']:
                                                    p.delete(replica['pfn'])
                                                else:
                                                    logging.warning('Reaper %s-%s: Deletion UNAVAILABLE of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                            monitor.record_timer('daemons.reaper.delete.%s.%s' % (p.attributes['scheme'], rse['rse']), (time.time()-s)*1000)
                                            duration = time.time() - s

                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})

                                            add_message('deletion-done', {'scope': replica['scope'],
                                                                          'name': replica['name'],
                                                                          'rse': rse_info['rse'],
                                                                          'file-size': replica['bytes'],
                                                                          'url': replica['pfn'],
                                                                          'duration': duration})
                                            logging.info('Reaper %s-%s: Deletion SUCCESS of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse']))
                                        except SourceNotFound:
                                            err_msg = 'Reaper %s-%s: Deletion NOTFOUND of %s:%s as %s on %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'])
                                            logging.warning(err_msg)
                                            deleted_files.append({'scope': replica['scope'], 'name': replica['name']})
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': err_msg})
                                        except (ServiceUnavailable, RSEAccessDenied) as e:
                                            logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except Exception as e:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                            add_message('deletion-failed', {'scope': replica['scope'],
                                                                            'name': replica['name'],
                                                                            'rse': rse_info['rse'],
                                                                            'file-size': replica['bytes'],
                                                                            'url': replica['pfn'],
                                                                            'reason': str(e)})
                                        except:
                                            logging.critical('Reaper %s-%s: Deletion CRITICAL of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(traceback.format_exc())))
                                except (ServiceUnavailable, RSEAccessDenied) as e:
                                    for replica in files:
                                        logging.warning('Reaper %s-%s: Deletion NOACCESS of %s:%s as %s on %s: %s' % (worker_number, child_number, replica['scope'], replica['name'], replica['pfn'], rse['rse'], str(e)))
                                        add_message('deletion-failed', {'scope': replica['scope'],
                                                                        'name': replica['name'],
                                                                        'rse': rse_info['rse'],
                                                                        'file-size': replica['bytes'],
                                                                        'url': replica['pfn'],
                                                                        'reason': str(e)})
                                finally:
                                    p.close()
                            s = time.time()
                            with monitor.record_timer_block('reaper.delete_replicas'):
                                delete_replicas(rse=rse['rse'], files=deleted_files)
                            logging.debug('Reaper %s-%s: delete_replicas successes %s %s %s' % (worker_number, child_number, rse['rse'], len(deleted_files), time.time() - s))
                            monitor.record_counter(counters='reaper.deletion.done',  delta=len(deleted_files))
                            deleting_rate += len(deleted_files)
                        except:
                            logging.critical(traceback.format_exc())
                    deleting_rate = deleting_rate * 1.0 / max_being_deleted_files
                    if deleting_rate > max_deleting_rate:
                        max_deleting_rate = deleting_rate
                except:
                    logging.critical(traceback.format_exc())

            if once:
                break

            logging.info(" Reaper %s-%s: max_deleting_rate: %s " % (worker_number, child_number, max_deleting_rate))
            sleep_time = int((1 - max_deleting_rate) * 60 + 1)
            time.sleep(sleep_time)

        except:
            logging.critical(traceback.format_exc())

    logging.info('Graceful stop requested')
    logging.info('Graceful stop done')
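
The adaptive sleep at the end of the loop scales with how much of the deletion quota was actually used: roughly one second when the reaper deleted everything it was allowed to, about a minute when it deleted nothing. A tiny standalone illustration of that formula:

def adaptive_sleep(deleted_files, max_being_deleted_files):
    """Return the sleep time in seconds derived from the achieved deletion rate."""
    deleting_rate = deleted_files * 1.0 / max_being_deleted_files
    return int((1 - deleting_rate) * 60 + 1)

print(adaptive_sleep(0, 10000))      # 61 -> nothing deleted, back off
print(adaptive_sleep(10000, 10000))  # 1  -> quota filled, loop again quickly
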
Example No. 43
0
    def test_record_timer_message(self):
        """MONITOR (CORE): Send a timer message to graphite """
        monitor.record_timer('test.runtime', 500)
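
In the daemons above, the value handed to record_timer is an elapsed wall-clock time converted to milliseconds; a minimal sketch of that measurement pattern (the record_timer call is left commented since the monitor module is not imported here):

import time

start = time.time()
time.sleep(0.05)                       # stand-in for the work being timed
elapsed_ms = (time.time() - start) * 1000
# monitor.record_timer('test.runtime', elapsed_ms)
print('elapsed: %.1f ms' % elapsed_ms)
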
Example No. 44
0
                tss = time.time()
                logging.debug('UPDATE REPLICA STATE DID %s:%s RSE %s' % (response['scope'], response['name'], rse_update_name))

                # make sure we do not leave the transaction
                try:
                    # try quickly
                    replica.update_replicas_states([{'rse': rse_update_name,
                                                     'scope': response['scope'],
                                                     'name': response['name'],
                                                     'state': ReplicaState.AVAILABLE}],
                                                   nowait=False,
                                                   session=session)
                except UnsupportedOperation:
                    # replica not found, but it has been transferred because cancel came too late.
                    # so we need to register the replica and schedule it for deletion again
                    record_timer('daemons.conveyor.common.update_request_state.replica-update_replicas_states', (time.time()-tss)*1000)
                    logging.warning('DID %s:%s AT RSE %s NOT FOUND - Registering replica and scheduling for immediate deletion' % (response['scope'],
                                                                                                                                   response['name'],
                                                                                                                                   rse_name))
                    did_meta = None
                    try:
                        did_meta = did.get_metadata(response['scope'],
                                                    response['name'],
                                                    session=session)
                    except:
                        logging.critical('DID %s:%s NOT FOUND - Cannot re-register replica - potential dark data' % (response['scope'],
                                                                                                                     response['name']))
                        raise

                    if did_meta:
                        try:
Example No. 45
0
def poller(once=False, activities=None, sleep_time=60,
           fts_bulk=100, db_bulk=1000, older_than=60, activity_shares=None):
    """
    Main loop to check the status of a transfer primitive with a transfertool.
    """

    try:
        timeout = config_get('conveyor', 'poll_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    executable = 'conveyor-poller'
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)
    if activity_shares:
        activities.sort()
        executable += ' --activity_shares ' + str(activity_shares)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])
    logging.info(prepend_str + 'Poller starting - db_bulk (%i) fts_bulk (%i) timeout (%s)' % (db_bulk, fts_bulk, timeout))

    time.sleep(10)  # To prevent running on the same partition if all the poller restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

    logging.info(prepend_str + 'Poller started')

    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'], heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                start_time = time.time()
                logging.debug(prepend_str + 'Start to poll transfers older than %i seconds for activity %s' % (older_than, activity))
                transfs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                                state=[RequestState.SUBMITTED],
                                                limit=db_bulk,
                                                older_than=datetime.datetime.utcnow() - datetime.timedelta(seconds=older_than),
                                                total_workers=heart_beat['nr_threads'], worker_number=heart_beat['assign_thread'],
                                                mode_all=False, hash_variable='id',
                                                activity=activity,
                                                activity_shares=activity_shares)

                record_timer('daemons.conveyor.poller.000-get_next', (time.time() - start_time) * 1000)

                if transfs:
                    logging.debug(prepend_str + 'Polling %i transfers for activity %s' % (len(transfs), activity))

                xfers_ids = {}
                for transf in transfs:
                    if not transf['external_host'] in xfers_ids:
                        xfers_ids[transf['external_host']] = []
                    xfers_ids[transf['external_host']].append((transf['external_id'], transf['request_id']))

                for external_host in xfers_ids:
                    external_ids = list({trf[0] for trf in xfers_ids[external_host]})
                    request_ids = [trf[1] for trf in xfers_ids[external_host]]
                    for xfers in chunks(external_ids, fts_bulk):
                        # poll transfers
                        poll_transfers(external_host=external_host, xfers=xfers, prepend_str=prepend_str, request_ids=request_ids, timeout=timeout)

                if len(transfs) < fts_bulk / 2:
                    logging.info(prepend_str + "Only %s transfers for activity %s, which is less than half of the bulk %s, will sleep %s seconds" % (len(transfs), activity, fts_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + "%s" % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
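
The poller above groups (external_id, request_id) pairs per FTS host and then polls the external ids in fixed-size chunks. A standalone sketch of that grouping, with a local chunks() helper standing in for the utility the daemon imports:

def chunks(sequence, size):
    """Yield successive slices of at most 'size' elements."""
    for index in range(0, len(sequence), size):
        yield sequence[index:index + size]

transfs = [{'external_host': 'https://fts.example.org:8446', 'external_id': 'job-1', 'request_id': 'req-1'},
           {'external_host': 'https://fts.example.org:8446', 'external_id': 'job-2', 'request_id': 'req-2'},
           {'external_host': 'https://fts-b.example.org:8446', 'external_id': 'job-3', 'request_id': 'req-3'}]

xfers_ids = {}
for transf in transfs:
    xfers_ids.setdefault(transf['external_host'], []).append((transf['external_id'], transf['request_id']))

for external_host, pairs in xfers_ids.items():
    external_ids = list({pair[0] for pair in pairs})
    for batch in chunks(external_ids, 100):
        print(external_host, batch)
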
Example No. 46
0
def update_bad_request(req, dest_rse, new_state, detail, session=None):
    if new_state == RequestState.FAILED:
        request.set_request_state(req['request_id'], new_state, session=session)

        activity = 'default'
        if req['attributes']:
            if isinstance(req['attributes'], dict):
                req_attributes = json.loads(json.dumps(req['attributes']))
            else:
                req_attributes = json.loads(str(req['attributes']))
            activity = req_attributes.get('activity') or 'default'

        tss = time.time()
        add_message('transfer-failed', {'activity': activity,
                                        'request-id': req['request_id'],
                                        'checksum-adler': None,
                                        'checksum-md5': None,
                                        'dst-rse': dest_rse,
                                        'dst-url': None,
                                        'name': req['name'],
                                        'guid': None,
                                        'file-size': None,
                                        'previous-request-id': req['request_id'],
                                        'protocol': None,
                                        'reason': detail,
                                        'transfer-link': None,
                                        'scope': req['scope'],
                                        'src-rse': None,
                                        'src-url': None,
                                        'tool-id': 'rucio-conveyor',
                                        'transfer-endpoint': config_get('conveyor', 'ftshosts'),
                                        'transfer-id': None},
                    session=session)

        request.archive_request(req['request_id'], session=session)
        logging.error('BAD DID %s:%s REQUEST %s details: %s' % (req['scope'],
                                                                req['name'],
                                                                req['request_id'],
                                                                detail))
        try:
            replica.update_replicas_states([{'rse': dest_rse,
                                             'scope': req['scope'],
                                             'name': req['name'],
                                             'state': ReplicaState.UNAVAILABLE}],
                                           session=session)
        except Exception:
            logging.critical("Could not update replica state for failed transfer %s:%s at %s (%s)" % (req['scope'],
                                                                                                      req['name'],
                                                                                                      dest_rse,
                                                                                                      traceback.format_exc()))
            raise

        tss = time.time()
        try:
            lock.failed_transfer(req['scope'],
                                 req['name'],
                                 req['dest_rse_id'],
                                 session=session)
        except Exception:
            logging.warn('Could not update lock for failed transfer %s:%s at %s (%s)' % (req['scope'],
                                                                                         req['name'],
                                                                                         dest_rse,
                                                                                         traceback.format_exc()))
            raise
        record_timer('daemons.conveyor.common.update_request_state.lock-failed_transfer', (time.time()-tss)*1000)
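Purely for illustration, a hypothetical request dictionary carrying the fields read above (request_id, scope, name, attributes, dest_rse_id) could look like the following; the call itself is left commented out because it needs a live database session, and every value is made up:

example_req = {'request_id': '11112222333344445555666677778888',
               'scope': 'mock',
               'name': 'file.test.0001',
               'attributes': '{"activity": "User Subscriptions"}',
               'dest_rse_id': 1}
# update_bad_request(example_req, dest_rse='MOCK_RSE', new_state=RequestState.FAILED,
#                    detail='transfer failed: checksum mismatch', session=session)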
Example No. 47
def stager(once=False,
           rses=None,
           mock=False,
           bulk=100,
           group_bulk=1,
           group_policy='rule',
           source_strategy=None,
           activities=None,
           sleep_time=600,
           retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None

    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" %
                  max_time_in_queue)
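    # Illustrative config value: 'User Subscriptions:48,Express:24' yields
    # {'User Subscriptions': 48, 'Express': 24}; a 'default' entry of 168 hours is
    # added when the configuration does not define one.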

    activity_next_exe_time = defaultdict(time.time)
    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Stager starting with bring_online %s seconds' %
                 (bring_online))

    time.sleep(10)  # To prevent all stagers from landing on the same partition if they restart at the same time
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Stager started')

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None

            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                logging.info(prepend_str +
                             'Starting to get stagein transfers for %s' %
                             (activity))
                start_time = time.time()
                transfers = __get_stagein_transfers(
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    mock=mock,
                    schemes=scheme,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts)
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.stager.get_stagein_transfers',
                               len(transfers))
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.transfers',
                    len(transfers))
                logging.info(prepend_str + 'Got %s stagein transfers for %s' %
                             (len(transfers), activity))

                # group transfers
                logging.info(prepend_str +
                             'Starting to group transfers for %s' % (activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk, source_strategy,
                                                   max_time_in_queue)
                record_timer('daemons.conveyor.stager.bulk_group_transfer',
                             (time.time() - start_time) * 1000 /
                             (len(transfers) if transfers else 1))
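                # grouped_jobs is expected to map an external host to a list of job
                # descriptions, e.g. (illustrative shape only):
                # {'https://fts3.example.org:8446': [{'files': [...], 'job_params': {...}}]}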

                logging.info(prepend_str +
                             'Starting to submit transfers for %s' %
                             (activity))
                # submit transfers
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        submit_transfer(external_host=external_host,
                                        job=job,
                                        submitter='transfer_submitter',
                                        logging_prepend_str=prepend_str)

                if len(transfers) < group_bulk:
                    logging.info(
                        prepend_str +
                        'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds'
                        % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time(
                        ) + sleep_time
        except Exception:
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
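As a rough usage sketch only (parameter values are assumptions), a single stager pass over one activity could be triggered like this:

# Illustrative single pass; activity name and bulk sizes are made up.
# stager(once=True, bulk=100, group_bulk=10, activities=['Staging'], sleep_time=60)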
Example No. 48
def automatix(sites, inputfile, sleep_time, account, worker_number=1, total_workers=1, once=False, dataset_lifetime=None, set_metadata=False):
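    """
    Main loop of the automatix daemon: pick a dataset description from the input file,
    generate the corresponding test files locally, upload them to each configured site
    and record monitoring counters and timers.
    """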
    sleep(sleep_time * (total_workers - worker_number) / total_workers)

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    prepend_str = 'Thread [%i/%i] : ' % (worker_number, total_workers)

    while not graceful_stop.is_set():
        heartbeat.live(executable, hostname, pid, hb_thread)
        starttime = time()
        logging.info(prepend_str + 'Getting data distribution')
        probabilities, data = get_data_distribution(inputfile)
        logging.debug(prepend_str + 'Probabilities %s' % (probabilities))
        account = 'root'
        scope = 'tests'
        totretries = 3
        status = False

        for site in sites:

            for retry in range(0, totretries):
                start_time = time()
                tmpdir = tempfile.mkdtemp()
                logging.info(prepend_str + 'Running on site %s' % (site))
                dic = choose_element(probabilities, data)
                metadata = dic['metadata']
                metadata['version'] = str(random.randint(0, 1000))
                metadata['run_number'] = str(random.randint(0, 100000))
                metadata['stream_name'] = 'automatix_stream'
                metadata['campaign'] = 'automatix_campaign'
                try:
                    nbfiles = dic['nbfiles']
                except KeyError:
                    nbfiles = 2
                    logging.warning(prepend_str + 'No nbfiles defined in the configuration, will use 2')
                try:
                    filesize = dic['filesize']
                except KeyError:
                    filesize = 1000000
                    logging.warning(prepend_str + 'No filesize defined in the configuration, will use 1M files')
                dsn = 'tests:%s.%s.%s.%s.%s.%s' % (metadata['project'], metadata['run_number'], metadata['stream_name'], metadata['prod_step'], metadata['datatype'], metadata['version'])
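                # Illustrative DSN: project='user.test', run_number='123', stream_name='automatix_stream',
                # prod_step='merge', datatype='AOD' and version='42' would give
                # 'tests:user.test.123.automatix_stream.merge.AOD.42'.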
                fnames = []
                lfns = []
                for dummy_nbfile in range(nbfiles):
                    fname = '%s.%s' % (metadata['datatype'], generate_uuid())
                    lfns.append(fname)
                    fname = '%s/%s' % (tmpdir, fname)
                    logging.info(prepend_str + 'Generating file %(fname)s in dataset %(dsn)s' % locals())
                    generate_file(fname, filesize)
                    fnames.append(fname)
                logging.info(prepend_str + 'Upload %s to %s' % (dsn, site))
                status = upload(files=lfns, scope=scope, metadata=metadata, rse=site, account=account, source_dir=tmpdir, worker_number=worker_number, total_workers=total_workers, dataset_lifetime=dataset_lifetime, did=dsn, set_metadata=set_metadata)
                for fname in fnames:
                    remove(fname)
                rmdir(tmpdir)
                if status:
                    monitor.record_counter(counters='automatix.addnewdataset.done', delta=1)
                    monitor.record_counter(counters='automatix.addnewfile.done', delta=nbfiles)
                    monitor.record_timer('automatix.datasetinjection', (time() - start_time) * 1000)
                    break
                else:
                    logging.info(prepend_str + 'Failed to upload files. Will retry (attempt %s/%s)' % (str(retry + 1), str(totretries)))
        if once is True:
            logging.info(prepend_str + 'Run with once mode. Exiting')
            break
        tottime = time() - starttime
        if status:
            logging.info(prepend_str + 'It took %s seconds to upload one dataset on %s' % (str(tottime), str(sites)))
            if sleep_time > tottime:
                logging.info(prepend_str + 'Will sleep for %s seconds' % (str(sleep_time - tottime)))
                sleep(sleep_time - tottime)
        else:
            logging.info(prepend_str + 'Retrying a new upload')
    logging.info(prepend_str + 'Graceful stop requested')
    heartbeat.die(executable, hostname, pid, hb_thread)
    logging.info(prepend_str + 'Graceful stop done')
Example No. 49
def finisher(once=False, sleep_time=60, activities=None, bulk=100, db_bulk=1000):
    """
    Main loop to update the replicas and rules based on finished requests.
    """
    try:
        conveyor_config = {item[0]: item[1] for item in items('conveyor')}
    except ConfigNotFound:
        logging.info('No configuration found for conveyor')
        conveyor_config = {}

    # Get suspicious patterns
    suspicious_patterns = conveyor_config.get('suspicious_pattern', [])
    if suspicious_patterns:
        pattern = str(suspicious_patterns)
        patterns = pattern.split(",")
        suspicious_patterns = [re.compile(pat.strip()) for pat in patterns]
    logging.debug("Suspicious patterns: %s" % [pat.pattern for pat in suspicious_patterns])

    retry_protocol_mismatches = conveyor_config.get('retry_protocol_mismatches', False)

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    # Make an initial heartbeat so that all finishers have the correct worker number on the next try
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
    logging.info('%s Finisher starting - db_bulk(%i) bulk (%i)', prepend_str, db_bulk, bulk)

    graceful_stop.wait(10)
    while not graceful_stop.is_set():

        start_time = time.time()
        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread, older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'] + 1, heart_beat['nr_threads'])
            logging.debug('%s Starting new cycle', prepend_str)
            if activities is None:
                activities = [None]

            for activity in activities:
                logging.debug('%s Working on activity %s', prepend_str, activity)
                time1 = time.time()
                reqs = request_core.get_next(request_type=[RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT],
                                             state=[RequestState.DONE, RequestState.FAILED, RequestState.LOST, RequestState.SUBMITTING,
                                                    RequestState.SUBMISSION_FAILED, RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES],
                                             limit=db_bulk,
                                             older_than=datetime.datetime.utcnow(),
                                             total_workers=heart_beat['nr_threads'] - 1,
                                             worker_number=heart_beat['assign_thread'],
                                             mode_all=True,
                                             hash_variable='rule_id')
                record_timer('daemons.conveyor.finisher.000-get_next', (time.time() - time1) * 1000)
                time2 = time.time()
                if reqs:
                    logging.debug('%s Updating %i requests for activity %s', prepend_str, len(reqs), activity)

                for chunk in chunks(reqs, bulk):
                    try:
                        time3 = time.time()
                        __handle_requests(chunk, suspicious_patterns, retry_protocol_mismatches, prepend_str)
                        record_timer('daemons.conveyor.finisher.handle_requests', (time.time() - time3) * 1000 / (len(chunk) if chunk else 1))
                        record_counter('daemons.conveyor.finisher.handle_requests', len(chunk))
                    except Exception as error:
                        logging.warn('%s %s', prepend_str, str(error))
                if reqs:
                    logging.debug('%s Finished updating %s finished requests for activity %s in %s seconds', prepend_str, len(reqs), activity, time.time() - time2)

        except (DatabaseException, DatabaseError) as error:
            if re.match('.*ORA-00054.*', error.args[0]) or re.match('.*ORA-00060.*', error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
                logging.warn('%s Lock detected when handling request - skipping: %s', prepend_str, str(error))
            else:
                logging.error('%s %s', prepend_str, traceback.format_exc())
        except Exception as error:
            logging.critical('%s %s', prepend_str, str(error))

        if once:
            return

        end_time = time.time()
        time_diff = end_time - start_time
        if time_diff < sleep_time:
            logging.info('%s Sleeping for %s seconds', prepend_str, (sleep_time - time_diff))
            graceful_stop.wait(sleep_time - time_diff)

    logging.info('%s Graceful stop requested', prepend_str)

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s Graceful stop done', prepend_str)
Example No. 50
def test_monitor_record_timer_message(self):
    """MONITOR (CORE): Send a timer message to graphite"""
    monitor.record_timer('test.runtime', 500)
Example No. 51
def run_once(bulk, db_bulk, suspicious_patterns, retry_protocol_mismatches,
             heartbeat_handler, activity):
    worker_number, total_workers, logger = heartbeat_handler.live()

    try:
        logger(logging.DEBUG, 'Working on activity %s', activity)
        time1 = time.time()
        reqs = request_core.get_next(
            request_type=[
                RequestType.TRANSFER, RequestType.STAGEIN, RequestType.STAGEOUT
            ],
            state=[
                RequestState.DONE, RequestState.FAILED, RequestState.LOST,
                RequestState.SUBMITTING, RequestState.SUBMISSION_FAILED,
                RequestState.NO_SOURCES, RequestState.ONLY_TAPE_SOURCES,
                RequestState.MISMATCH_SCHEME
            ],
            limit=db_bulk,
            older_than=datetime.datetime.utcnow(),
            total_workers=total_workers,
            worker_number=worker_number,
            mode_all=True,
            hash_variable='rule_id')
        record_timer('daemons.conveyor.finisher.get_next',
                     (time.time() - time1) * 1000)
        time2 = time.time()
        if reqs:
            logger(logging.DEBUG, 'Updating %i requests for activity %s',
                   len(reqs), activity)

        for chunk in chunks(reqs, bulk):
            try:
                worker_number, total_workers, logger = heartbeat_handler.live()
                time3 = time.time()
                __handle_requests(chunk,
                                  suspicious_patterns,
                                  retry_protocol_mismatches,
                                  logger=logger)
                record_timer('daemons.conveyor.finisher.handle_requests_time',
                             (time.time() - time3) * 1000 /
                             (len(chunk) if chunk else 1))
                record_counter('daemons.conveyor.finisher.handle_requests',
                               delta=len(chunk))
            except Exception as error:
                logger(logging.WARNING, '%s', str(error))
        if reqs:
            logger(
                logging.DEBUG,
                'Finished updating %s finished requests for activity %s in %s seconds',
                len(reqs), activity,
                time.time() - time2)

    except (DatabaseException, DatabaseError) as error:
        if re.match('.*ORA-00054.*', error.args[0]) or re.match(
                '.*ORA-00060.*',
                error.args[0]) or 'ERROR 1205 (HY000)' in error.args[0]:
            logger(logging.WARNING,
                   'Lock detected when handling request - skipping: %s',
                   str(error))
        else:
            raise
    return True
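run_once only needs a heartbeat_handler whose live() returns a worker number, the total number of workers and a logger callable taking (level, message, *args). A minimal stand-in for local experimentation (purely illustrative, not the Rucio heartbeat handler) might look like:

import logging


class StubHeartbeatHandler:
    """Illustrative stand-in for the daemon heartbeat handler."""

    def live(self):
        # logging.log already has the (level, message, *args) signature used above.
        return 0, 1, logging.log


# Illustrative call; parameter values are assumptions.
# run_once(bulk=100, db_bulk=1000, suspicious_patterns=[], retry_protocol_mismatches=False,
#          heartbeat_handler=StubHeartbeatHandler(), activity=None)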
Example No. 52
def request_transfer(once=False, src=None, dst=None):
    """
    Main loop to request a new transfer.
    """

    logging.info('request: starting')

    site_a = 'RSE%s' % generate_uuid().upper()
    site_b = 'RSE%s' % generate_uuid().upper()

    scheme = 'https'
    impl = 'rucio.rse.protocols.webdav.Default'
    if not src.startswith('https://'):
        scheme = 'srm'
        impl = 'rucio.rse.protocols.srm.Default'
        srctoken = src.split(':')[0]
        dsttoken = dst.split(':')[0]

    tmp_proto = {
        'impl': impl,
        'scheme': scheme,
        'domains': {
            'lan': {'read': 1, 'write': 1, 'delete': 1},
            'wan': {'read': 1, 'write': 1, 'delete': 1}}}

    rse.add_rse(site_a)
    tmp_proto['hostname'] = src.split(':')[1][2:]
    tmp_proto['port'] = src.split(':')[2].split('/')[0]
    tmp_proto['prefix'] = '/'.join([''] + src.split(':')[2].split('/')[1:])
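    # Illustrative parse: src = 'srm://se.example.org:8446/dpm/example.org/home/rucio'
    # gives hostname 'se.example.org', port '8446' and prefix '/dpm/example.org/home/rucio'.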
    if scheme == 'srm':
        tmp_proto['extended_attributes'] = {'space_token': srctoken,
                                            'web_service_path': ''}
    rse.add_protocol(site_a, tmp_proto)

    tmp_proto = {
        'impl': impl,
        'scheme': scheme,
        'domains': {
            'lan': {'read': 1, 'write': 1, 'delete': 1},
            'wan': {'read': 1, 'write': 1, 'delete': 1}}}

    rse.add_rse(site_b)
    tmp_proto['hostname'] = dst.split(':')[1][2:]
    tmp_proto['port'] = dst.split(':')[2].split('/')[0]
    tmp_proto['prefix'] = '/'.join([''] + dst.split(':')[2].split('/')[1:])
    if scheme == 'srm':
        tmp_proto['extended_attributes'] = {'space_token': dsttoken,
                                            'web_service_path': ''}
    rse.add_protocol(site_b, tmp_proto)

    si = rsemanager.get_rse_info(site_a)

    session = get_session()

    logging.info('request: started')

    while not graceful_stop.is_set():

        try:

            ts = time.time()

            tmp_name = generate_uuid()

            # add a new dataset
            did.add_did(scope='mock', name='dataset-%s' % tmp_name,
                        type=DIDType.DATASET, account='root', session=session)

            # construct PFN
            pfn = rsemanager.lfns2pfns(si, lfns=[{'scope': 'mock', 'name': 'file-%s' % tmp_name}])['mock:file-%s' % tmp_name]

            # create the directories if needed
            p = rsemanager.create_protocol(si, operation='write', scheme=scheme)
            p.connect()
            try:
                p.mkdir(pfn)
            except Exception:
                # The directory may already exist or mkdir may be unsupported; ignore.
                pass

            # upload the test file
            try:
                fp = os.path.dirname(config_get('injector', 'file'))
                fn = os.path.basename(config_get('injector', 'file'))
                p.put(fn, pfn, source_dir=fp)
            except Exception:
                logging.critical('Could not upload, removing temporary DID: %s' % str(sys.exc_info()))
                did.delete_dids([{'scope': 'mock', 'name': 'dataset-%s' % tmp_name}], account='root', session=session)
                break

            # add the replica
            replica.add_replica(rse=site_a, scope='mock', name='file-%s' % tmp_name,
                                bytes=config_get_int('injector', 'bytes'),
                                adler32=config_get('injector', 'adler32'),
                                md5=config_get('injector', 'md5'),
                                account='root', session=session)

            # to the dataset
            did.attach_dids(scope='mock', name='dataset-%s' % tmp_name, dids=[{'scope': 'mock',
                                                                               'name': 'file-%s' % tmp_name,
                                                                               'bytes': config_get('injector', 'bytes')}],
                            account='root', session=session)

            # add rule for the dataset
            ts = time.time()

            rule.add_rule(dids=[{'scope': 'mock', 'name': 'dataset-%s' % tmp_name}],
                          account='root',
                          copies=1,
                          rse_expression=site_b,
                          grouping='ALL',
                          weight=None,
                          lifetime=None,
                          locked=False,
                          subscription_id=None,
                          activity='mock-injector',
                          session=session)

            logging.info('added rule for %s for DID mock:%s' % (site_b, tmp_name))
            record_timer('daemons.mock.conveyorinjector.add_rule', (time.time()-ts)*1000)

            record_counter('daemons.mock.conveyorinjector.request_transfer')

            session.commit()
        except Exception:
            session.rollback()
            logging.critical(traceback.format_exc())

        if once:
            return

    logging.info('request: graceful stop requested')

    logging.info('request: graceful stop done')