Example #1
    def message_handle(self, msg):
        record_counter('daemons.cache.consumer.message_handle.message')

        try:
            if isinstance(msg, dict) and 'operation' in msg.keys():
                if msg['operation'] == 'add_replicas':
                    validate_schema(name='cache_add_replicas', obj=msg)
                    if 'rse' in msg.keys() and 'files' in msg.keys():
                        logging.debug('[%s] %s %s %s' % (self.__broker, msg['operation'], msg['rse'], msg['files']))
                        try:
                            if not self.__rse_volatile.get_volatile(msg['rse']):
                                logging.error("%s volatile is not True, Rucio Cache should not update it." % (msg['rse']))
                            else:
                                cache_add_replicas(rse=msg['rse'], files=msg['files'], account=self.__account, lifetime=msg['lifetime'])
                        except Exception as e:
                            logging.error('[%s] %s %s %s %s with error details: %s' % (self.__broker, msg['operation'], msg['rse'], msg['files'], str(e), str(format_exc())))

                if msg['operation'] == 'delete_replicas':
                    validate_schema(name='cache_delete_replicas', obj=msg)
                    if 'rse' in msg.keys() and 'files' in msg.keys():
                        logging.debug('[%s] %s %s %s' % (self.__broker, msg['operation'], msg['rse'], msg['files']))
                        try:
                            if not self.__rse_volatile.get_volatile(msg['rse']):
                                logging.error("%s volatile is not True, Rucio Cache should not update it." % (msg['rse']))
                            else:
                                cache_delete_replicas(rse=msg['rse'], files=msg['files'], account=self.__account)
                        except Exception as e:
                            logging.error('[%s] %s %s %s %s with error details: %s' % (self.__broker, msg['operation'], msg['rse'], msg['files'], str(e), str(format_exc())))
Example #2
def query_request_details(request_id, transfertool='fts3', session=None):
    """
    Query the detailed status of a request. Can also be done after the
    external transfer has finished.

    :param request_id: Request-ID as a 32 character hex string.
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Detailed request status information as a dictionary.
    """

    record_counter('core.request.query_request_details')

    req = get_request(request_id, session=session)

    if not req:
        return

    if transfertool == 'fts3':
        ts = time.time()
        tmp = fts3.query_details(req['external_id'], req['external_host'])
        record_timer('core.request.query_details_fts3', (time.time() - ts) * 1000)
        return tmp

    raise NotImplementedError
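A hedged usage sketch for query_request_details; the request id is a placeholder and 'session' stands in for a live database session supplied by the caller:

details = query_request_details('cd8a2e1b4f6d4e0f9a7b3c5d1e2f3a4b', session=session)
if details is None:
    print('request not found')
else:
    for file_info in details:  # per-file entries, as returned by fts3.query_details
        print(file_info.get('file_state'))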
Example #3
def get_request_by_did(scope, name, rse, rse_id=None, request_type=None, session=None):
    """
    Retrieve a request by its DID for a destination RSE.

    :param scope: The scope of the data identifier.
    :param name: The name of the data identifier.
    :param rse: The destination RSE of the request.
    :param rse_id: The destination RSE ID of the request. Overrides rse param!
    :param request_type: The type of request as rucio.db.constants.RequestType.
    :param session: Database session to use.
    :returns: Request as a dictionary.
    """

    record_counter('core.request.get_request_by_did')
    try:
        tmp = session.query(models.Request).filter_by(scope=scope,
                                                      name=name)

        if rse_id:
            tmp = tmp.filter_by(dest_rse_id=rse_id)
        else:
            tmp = tmp.filter_by(dest_rse_id=get_rse_id(rse))

        if request_type:
            tmp = tmp.filter_by(request_type=request_type)

        tmp = tmp.first()
        if not tmp:
            return
        else:
            tmp = dict(tmp)
            tmp.pop('_sa_instance_state')
            return tmp
    except IntegrityError as e:
        raise RucioException(e.args)
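A hedged usage sketch for get_request_by_did; the scope, name, and RSE values are placeholders, and RequestType is the constant this module already uses:

req = get_request_by_did(scope='mock', name='file.test.0001', rse='MOCK_RSE',
                         request_type=RequestType.TRANSFER, session=session)
if req is None:
    print('no matching request')
else:
    print(req['state'], req['dest_rse_id'])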
Example #4
def query(tid, session):
    """
    Query the transfer job information of a single job. Has a chance to progress the job from QUEUED to either DONE or FAILED.

    :param tid: The transfer job id.
    :param session: Database session to use.
    :returns: The transfer job information.
    """

    record_counter('daemons.mock.fts3.query')

    ts = time.time()
    new_state = random.sample(sum([[FTSState.FINISHED]*15, [FTSState.FAILED]*3, [FTSState.FINISHEDDIRTY]*2, [FTSState.ACTIVE]*80], []), 1)[0]
    record_timer('daemons.mock.fts3.query.000-random_sample', (time.time()-ts)*1000)

    ts = time.time()

    query = session.query(test_models.MockFTSTransfer).filter(and_(test_models.MockFTSTransfer.transfer_id == tid,
                                                                   or_(test_models.MockFTSTransfer.state == FTSState.SUBMITTED,
                                                                       test_models.MockFTSTransfer.state == FTSState.ACTIVE)))

    if query.update({'state': new_state,
                     'last_modified': datetime.datetime.utcnow()}) == 0:
        return None

    r = {'job_state': str(new_state)}
    if new_state == FTSState.FAILED or new_state == FTSState.FINISHEDDIRTY:
        r['reason'] = 'Mock FTS decided to kill your transfer.'
        r['files'] = [{'source_surl': 'mock_src', 'dest_surl': 'mock_dest', 'reason': 'mock failure'}]

    return r
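The weighted draw above samples one state from a concatenated list (15% FINISHED, 3% FAILED, 2% FINISHEDDIRTY, 80% ACTIVE). On Python 3.6+ the same distribution can be written more directly with random.choices; a sketch using the FTSState constants from the example, not part of the original module:

import random

# Same 15/3/2/80 weighting as the sum()-of-lists construction above.
new_state = random.choices(
    [FTSState.FINISHED, FTSState.FAILED, FTSState.FINISHEDDIRTY, FTSState.ACTIVE],
    weights=[15, 3, 2, 80],
    k=1,
)[0]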
Example #5
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting' % worker_number)
    logging.info('Undertaker(%s): started' % worker_number)
    while not graceful_stop.is_set():
        try:
            dids = list_expired_dids(worker_number=worker_number, total_workers=total_workers, limit=10000)
            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.' % worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Received %s dids to delete' % (worker_number, len(chunk)))
                    delete_dids(dids=chunk, account='root')
                    logging.info('Undertaker(%s): Deleted %s dids' % (worker_number, len(chunk)))
                    record_counter(counters='undertaker.delete_dids', delta=len(chunk))
                except DatabaseException as e:
                    logging.error('Undertaker(%s): Got database error %s.' % (worker_number, str(e)))
        except Exception:
            logging.error(traceback.format_exc())
            time.sleep(1)

        if once:
            break

    logging.info('Undertaker(%s): graceful stop requested' % worker_number)
    logging.info('Undertaker(%s): graceful stop done' % worker_number)
Example #6
def delete_dids(dids, account, session=None):
    """
    Delete data identifiers

    :param dids: The list of dids to delete.
    :param account: The account.
    :param session: The database session in use.
    """

    rule_id_clause = []
    content_clause = []
    parent_content_clause = []
    did_clause = []
    for did in dids:
        logging.info('Removing did %s:%s' % (did['scope'], did['name']))
        did_clause.append(and_(models.DataIdentifier.scope == did['scope'], models.DataIdentifier.name == did['name']))
        parent_content_clause.append(and_(models.DataIdentifierAssociation.child_scope == did['scope'], models.DataIdentifierAssociation.child_name == did['name']))
        rule_id_clause.append(and_(models.ReplicationRule.scope == did['scope'], models.ReplicationRule.name == did['name']))
        content_clause.append(and_(models.DataIdentifierAssociation.scope == did['scope'], models.DataIdentifierAssociation.name == did['name']))

        # Send message for AMI
        add_message('ERASE', {'account': account,
                              'scope': did['scope'],
                              'name': did['name']},
                    session=session)

    # Delete rules on did
    if rule_id_clause:
        with record_timer_block('undertaker.rules'):
            for (rule_id, scope, name, rse_expression, ) in session.query(models.ReplicationRule.id,
                                                                          models.ReplicationRule.scope,
                                                                          models.ReplicationRule.name,
                                                                          models.ReplicationRule.rse_expression).filter(or_(*rule_id_clause)):
                logging.debug('Removing rule %s for did %s:%s on RSE-Expression %s' % (str(rule_id), scope, name, rse_expression))
                rucio.core.rule.delete_rule(rule_id=rule_id, nowait=True, session=session)

    # Detach from parent dids:
    existing_parent_dids = False
    if parent_content_clause:
        with record_timer_block('undertaker.parent_content'):
            for parent_did in session.query(models.DataIdentifierAssociation).filter(or_(*parent_content_clause)):
                existing_parent_dids = True
                detach_dids(scope=parent_did.scope, name=parent_did.name, dids=[{'scope': parent_did.child_scope, 'name': parent_did.child_name}], session=session)

    # Remove content
    if content_clause:
        with record_timer_block('undertaker.content'):
            rowcount = session.query(models.DataIdentifierAssociation).filter(or_(*content_clause)).\
                delete(synchronize_session=False)
        record_counter(counters='undertaker.content.rowcount',  delta=rowcount)

    # remove data identifier
    if existing_parent_dids:
        # Exit method early to give the Judge time to remove locks (otherwise, due to foreign keys, did removal does not work).
        logging.debug('Leaving delete_dids early for Judge-Evaluator checks')
        return
    with record_timer_block('undertaker.dids'):
        rowcount = session.query(models.DataIdentifier).filter(or_(*did_clause)).\
            filter(or_(models.DataIdentifier.did_type == DIDType.CONTAINER, models.DataIdentifier.did_type == DIDType.DATASET)).\
            delete(synchronize_session=False)
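delete_dids collects one and_() term per DID and joins them with or_(), so rules, parent attachments, content, and the DIDs themselves are each handled with a single bulk query instead of per-DID round trips. A sketch of the same pattern, assuming the same models module and a live session:

from sqlalchemy import and_, or_

dids = [{'scope': 'mock', 'name': 'dataset_%s' % i} for i in range(3)]
clauses = [and_(models.DataIdentifier.scope == did['scope'],
                models.DataIdentifier.name == did['name']) for did in dids]

# One query matches all listed DIDs at once.
matching = session.query(models.DataIdentifier).filter(or_(*clauses)).all()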
Example #7
def necromancer(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identifies lost DIDs and, for non-lost ones, sets the locks and rules for re-evaluation.

    :param worker_number: The number of the worker (thread).
    :param total_workers: The total number of workers (threads).
    :param chunk_size: The size of the chunks to process.
    :param once: Run only once.
    """
    sleep_time = 60
    while not graceful_stop.is_set():
        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=chunk_size, worker_number=worker_number, total_workers=total_workers)
            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info('Thread [%i/%i] : Working on %s:%s on %s' % (worker_number, total_workers, scope, name, rse))
                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info('Thread [%i/%i] : File %s:%s has no other replicas, it will be marked as lost' % (worker_number, total_workers, scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile',  delta=1)
                    except DatabaseException as e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
                else:
                    logging.info('Thread [%i/%i] : File %s:%s can be recovered. Available sources : %s' % (worker_number, total_workers, scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id)
                        monitor.record_counter(counters='necromancer.badfiles.recovering',  delta=1)
                    except DatabaseException as e:
                        logging.info('Thread [%i/%i] : %s' % (worker_number, total_workers, str(e)))
            logging.info('Thread [%i/%i] : It took %s seconds to process %s replicas' % (worker_number, total_workers, str(time.time() - stime), str(len(replicas))))
Example #8
def requeue_and_archive(request_id, session=None):
    """
    Requeue and archive a failed request.
    TODO: Multiple requeue.

    :param request_id: Original request ID as a string.
    :param session: Database session to use.
    """

    record_counter('core.request.requeue_request')
    new_req = get_request(request_id, session=session)

    if new_req:
        archive_request(request_id, session=session)
        new_req['request_id'] = generate_uuid()
        new_req['previous_attempt_id'] = request_id
        if new_req['retry_count'] is None:
            new_req['retry_count'] = 1
        else:
            new_req['retry_count'] += 1

        # hardcoded for now - only requeue a couple of times
        if new_req['retry_count'] < 4:
            queue_requests([new_req], session=session)
            return new_req
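requeue_and_archive returns the new request when it requeues and None otherwise, so callers can tell a retried request from one that hit the hardcoded limit; a hedged usage sketch:

new_req = requeue_and_archive(request_id, session=session)
if new_req is None:
    # Either the original request no longer exists or retry_count reached 4.
    logging.warning('request %s was archived without requeue' % request_id)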
Example #9
def submit_transfers(transfers, transfertool='fts3', job_metadata=None, session=None):
    """
    Submit transfer request to a transfertool.

    :param transfers: Dictionary containing request metadata.
    :param transfertool: Transfertool as a string.
    :param job_metadata: Metadata key/value pairs for all files as a dictionary.
    :param session: Database session to use.
    :returns: Transfertool external ID.
    """

    record_counter('core.request.submit_transfer')

    if job_metadata is None:
        job_metadata = {}

    if transfertool != 'fts3':
        raise NotImplementedError

    ts = time.time()
    transfer_ids = fts3.submit_transfers(transfers, job_metadata)
    record_timer('core.request.submit_transfers_fts3', (time.time() - ts) * 1000)

    for transfer_id in transfer_ids:
        session.query(models.Request)\
               .filter_by(id=transfer_id)\
               .update({'state': RequestState.SUBMITTED,
                        'external_id': transfer_ids[transfer_id]['external_id'],
                        'external_host': transfer_ids[transfer_id]['external_host'],
                        'dest_url': transfer_ids[transfer_id]['dest_urls'][0]},
                       synchronize_session=False)

    return transfer_ids
Example #10
def query(transfer_id, transfer_host):
    """
    Query the status of a transfer in FTS3 via JSON.

    :param transfer_id: FTS transfer identifier as a string.
    :param transfer_host: FTS server as a string.
    :returns: Transfer status information as a dictionary.
    """

    job = None

    if transfer_host.startswith("https://"):
        job = requests.get(
            "%s/jobs/%s" % (transfer_host, transfer_id),
            verify=False,
            cert=(__USERCERT, __USERCERT),
            headers={"Content-Type": "application/json"},
        )
    else:
        job = requests.get("%s/jobs/%s" % (transfer_host, transfer_id), headers={"Content-Type": "application/json"})
    if job and job.status_code == 200:
        record_counter("transfertool.fts3.%s.query.success" % __extract_host(transfer_host))
        return job.json()

    record_counter("transfertool.fts3.%s.query.failure" % __extract_host(transfer_host))
    raise Exception("Could not retrieve transfer information: %s", job.content)
Example #11
def cancel_request_did(scope, name, dest_rse_id, request_type=RequestType.TRANSFER, session=None):
    """
    Cancel a request based on a DID and request type.

    :param scope: Data identifier scope as a string.
    :param name: Data identifier name as a string.
    :param dest_rse_id: RSE id as a string.
    :param request_type: Type of the request.
    :param session: Database session to use.
    """

    record_counter('core.request.cancel_request_did')

    reqs = None
    try:
        reqs = session.query(models.Request.id,
                             models.Request.external_id,
                             models.Request.external_host).filter_by(scope=scope,
                                                                     name=name,
                                                                     dest_rse_id=dest_rse_id,
                                                                     request_type=request_type).all()
        if not reqs:
            logging.warn('Tried to cancel non-existent request for DID %s:%s at RSE ID %s' % (scope, name, dest_rse_id))
    except IntegrityError as e:
        raise RucioException(e.args)
Example #12
def version(transfer_host):
    """
    Returns FTS3 server information.

    :param transfer_host: FTS server as a string.

    :returns: FTS3 server information as a dictionary.
    """

    r = None

    if transfer_host.startswith("https://"):
        r = requests.get(
            "%s/" % transfer_host,
            verify=False,
            cert=(__USERCERT, __USERCERT),
            headers={"Content-Type": "application/json"},
        )
    else:
        r = requests.get("%s/" % transfer_host, headers={"Content-Type": "application/json"})

    if r and r.status_code == 200:
        record_counter("transfertool.fts3.%s.version.success" % __extract_host(transfer_host))
        return r.json()

    record_counter("transfertool.fts3.%s.version.failure" % __extract_host(transfer_host))
    raise Exception("Could not retrieve version: %s", r.content)
Example #13
def archive_request(request_id, session=None):
    """
    Move a request to the history table.

    :param request_id: Request-ID as a 32 character hex string.
    :param session: Database session to use.
    """

    record_counter('core.request.archive')
    req = get_request(request_id=request_id, session=session)

    if req:
        hist_request = models.Request.__history_mapper__.class_(id=req['id'],
                                                                request_type=req['request_type'],
                                                                scope=req['scope'],
                                                                name=req['name'],
                                                                dest_rse_id=req['dest_rse_id'],
                                                                attributes=req['attributes'],
                                                                state=req['state'],
                                                                external_id=req['external_id'],
                                                                retry_count=req['retry_count'],
                                                                err_msg=req['err_msg'],
                                                                previous_attempt_id=req['previous_attempt_id'],
                                                                external_host=req['external_host'],
                                                                rule_id=req['rule_id'],
                                                                activity=req['activity'],
                                                                bytes=req['bytes'],
                                                                md5=req['md5'],
                                                                adler32=req['adler32'],
                                                                dest_url=req['dest_url'])
        hist_request.save(session=session)
        try:
            session.query(models.Request).filter_by(id=request_id).delete()
        except IntegrityError as e:
            raise RucioException(e.args)
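models.Request.__history_mapper__.class_ comes from the SQLAlchemy versioned-objects recipe: it is the mapped class of the history table, and copy-then-delete is what moves a row there. A compact sketch of the same move, assuming the history mapper and the .save() model helper exist as used above:

HistoryRequest = models.Request.__history_mapper__.class_

# Copy the live row's columns into a history row, then drop the live row.
hist = HistoryRequest(id=req['id'], state=req['state'])  # plus the remaining columns
hist.save(session=session)
session.query(models.Request).filter_by(id=req['id']).delete()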
Example #14
def cancel(transfer_id, transfer_host):
    """
    Cancel a transfer that has been submitted to FTS via JSON.

    :param transfer_id: FTS transfer identifier as a string.
    :param transfer_host: FTS server as a string.
    """

    job = None

    if transfer_host.startswith("https://"):
        job = requests.delete(
            "%s/jobs/%s" % (transfer_host, transfer_id),
            verify=False,
            cert=(__USERCERT, __USERCERT),
            headers={"Content-Type": "application/json"},
        )
    else:
        job = requests.delete("%s/jobs/%s" % (transfer_host, transfer_id), headers={"Content-Type": "application/json"})
    if job and job.status_code == 200:
        record_counter("transfertool.fts3.%s.cancel.success" % __extract_host(transfer_host))
        return job.json()

    record_counter("transfertool.fts3.%s.cancel.failure" % __extract_host(transfer_host))
    raise Exception("Could not cancel transfer: %s", job.content)
Example #15
def query_details(transfer_id, transfer_host):
    """
    Query the detailed status of a transfer in FTS3 via JSON.

    :param transfer_id: FTS transfer identifier as a string.
    :param transfer_host: FTS server as a string.
    :returns: Detailed transfer status information as a dictionary.
    """

    files = None

    if transfer_host.startswith("https://"):
        files = requests.get(
            "%s/jobs/%s/files" % (transfer_host, transfer_id),
            verify=False,
            cert=(__USERCERT, __USERCERT),
            headers={"Content-Type": "application/json"},
        )
    else:
        files = requests.get(
            "%s/jobs/%s/files" % (transfer_host, transfer_id), headers={"Content-Type": "application/json"}
        )
    if files and files.status_code == 200:
        record_counter("transfertool.fts3.%s.query_details.success" % __extract_host(transfer_host))
        return files.json()

    record_counter("transfertool.fts3.%s.query_details.failure" % __extract_host(transfer_host))
    return
Example #16
    def on_message(self, headers, message):
        record_counter('daemons.cache.consumer2.message')

        msg = json.loads(message)
        id = msg['id']
        if id % self.__num_thread == self.__id:
            self.message_handle(msg['payload'])
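The id % num_threads test shards the message stream across consumer threads without any coordination: each message id lands on exactly one thread. A minimal self-contained sketch of the idea:

NUM_THREADS = 4

def owning_thread(message_id, num_threads=NUM_THREADS):
    # Deterministic, coordination-free assignment of a message to a thread.
    return message_id % num_threads

# Every thread index 0..3 owns roughly a quarter of the ids.
assert sorted({owning_thread(i) for i in range(100)}) == [0, 1, 2, 3]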
Example #17
def consumer(id, total_threads=1):
    """
    Main loop to consume messages from the FTS3 producer.
    """

    logging.info('consumer starting')

    brokers_alias = []
    brokers_resolved = []
    try:
        brokers_alias = [b.strip() for b in config_get('messaging-fts3', 'brokers').split(',')]
    except Exception:
        raise Exception('Could not load brokers from configuration')

    logging.info('resolving broker dns alias: %s' % brokers_alias)

    for broker in brokers_alias:
        brokers_resolved.append([str(tmp_broker) for tmp_broker in dns.resolver.query(broker, 'A')])
    brokers_resolved = [item for sublist in brokers_resolved for item in sublist]

    logging.debug('brokers resolved to %s', brokers_resolved)

    conns = []
    for broker in brokers_resolved:
        conns.append(stomp.Connection(host_and_ports=[(broker, config_get_int('messaging-fts3', 'port'))],
                                      use_ssl=True,
                                      ssl_key_file=config_get('messaging-fts3', 'ssl_key_file'),
                                      ssl_cert_file=config_get('messaging-fts3', 'ssl_cert_file'),
                                      ssl_version=ssl.PROTOCOL_TLSv1))

    logging.info('consumer started')

    while not graceful_stop.is_set():

        for conn in conns:

            if not conn.is_connected():
                logging.info('connecting to %s' % conn.transport._Transport__host_and_ports[0][0])
                record_counter('daemons.messaging.fts3.reconnect.%s' % conn.transport._Transport__host_and_ports[0][0].split('.')[0])

                conn.set_listener('rucio-messaging-fts3', Consumer(broker=conn.transport._Transport__host_and_ports[0], id=id, total_threads=total_threads))
                conn.start()
                conn.connect()
                conn.subscribe(destination=config_get('messaging-fts3', 'destination'),
                               id='rucio-messaging-fts3',
                               ack='auto')

        time.sleep(1)

    logging.info('graceful stop requested')

    for conn in conns:
        try:
            conn.disconnect()
        except:
            pass

    logging.info('graceful stop done')
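The broker bootstrap resolves each configured alias to its A records and flattens the per-alias lists into one flat list of hosts. That resolve-and-flatten step in isolation, assuming dnspython is installed (dns.resolver.query is the call this era of the code uses; newer dnspython spells it resolve):

import dns.resolver

brokers_alias = ['broker.example.org']  # placeholder alias
per_alias = [[str(rr) for rr in dns.resolver.query(alias, 'A')]
             for alias in brokers_alias]
brokers_resolved = [host for sublist in per_alias for host in sublist]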
Example #18
def submit_deletion(url, session=None):
    """
    Submit a deletion request to a deletiontool.

    :param url: URL acceptable to deletiontool as a string.
    :param session: Database session to use.
    :returns: Deletiontool external ID.
    """

    record_counter('core.request.submit_deletion')
Example #19
def list_all(session):
    """
    List all transfer jobs.

    :param session: Database session to use.
    :returns: Generator over transfer job rows, most recently modified first.
    """

    record_counter('daemons.mock.fts3.list_all')

    query = session.query(test_models.MockFTSTransfer).order_by(test_models.MockFTSTransfer.lastmodified.desc())
    for row in query.yield_per(5):
        yield row
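Because of yield_per(5), list_all streams rows in small batches instead of materializing the full result set; a hedged usage sketch, assuming a live session:

for transfer in list_all(session):
    print(transfer.transfer_id, transfer.state)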
Example #20
def bulk_query_requests(request_host, request_ids, transfertool='fts3', session=None):
    """
    Query the status of a request.

    :param request_host: Name of the external host.
    :param request_ids: List of (Request-ID as a 32 character hex string, External-ID as a 32 character hex string)
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Request status information as a dictionary.
    """

    record_counter('core.request.query_request')

    transfer_ids = []
    for request_id, external_id in request_ids:
        if external_id not in transfer_ids:
            transfer_ids.append(external_id)

    if transfertool == 'fts3':
        ts = time.time()
        fts_resps = fts3.bulk_query(transfer_ids, request_host)
        record_timer('core.request.query_bulk_request_fts3', (time.time() - ts) * 1000 / len(transfer_ids))

        responses = {}
        for request_id, external_id in request_ids:
            fts_resp = fts_resps[external_id]
            if not fts_resp:
                req_status = {}
                req_status['new_state'] = RequestState.LOST
                req_status['request_id'] = request_id
            elif isinstance(fts_resp, Exception):
                req_status = fts_resp
            else:
                req_status = fts_resp
                # needed for unfinished jobs
                req_status['request_id'] = request_id

                if req_status['job_state'] in (str(FTSState.FAILED),
                                               str(FTSState.FINISHEDDIRTY),
                                               str(FTSState.CANCELED)):
                    req_status['new_state'] = RequestState.FAILED
                elif req_status['job_state'] == str(FTSState.FINISHED):
                    req_status['new_state'] = RequestState.DONE

            responses[request_id] = req_status
        return responses
    else:
        raise NotImplementedError
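The FTS-to-Rucio state mapping embedded above also appears in query_request (Example #38); factored out it is just a small lookup. A hedged helper sketch using the FTSState and RequestState constants this module already imports:

def map_fts_state(job_state):
    # Terminal FTS3 failure states map to FAILED, FINISHED maps to DONE,
    # and anything else means the job is still in flight.
    if job_state in (str(FTSState.FAILED), str(FTSState.FINISHEDDIRTY), str(FTSState.CANCELED)):
        return RequestState.FAILED
    if job_state == str(FTSState.FINISHED):
        return RequestState.DONE
    return None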
Example #21
    def make_replicas_available(self):
        """
        Marks replicas for the dataset at the RSE as available if they are present in PhEDEx.
        """

        with monitor.record_timer_block('cms_sync.time_recover_replica'):
            logging.info('Recovering unavailable replicas for %s:%s at %s',
                         self.scope, self.block_name, self.rse)

            replicas = list_replicas(dids=[{
                'scope': self.scope,
                'name': self.block_name
            }],
                                     rse_expression='rse=%s' % self.rse,
                                     all_states=True)

            try:
                unavailable_replicas = {
                    repl['name']
                    for repl in replicas
                    if repl['states'][self.rse] != 'AVAILABLE'
                }
            except TypeError:
                unavailable_replicas = set()

            phedex_replicas = set(self.replicas.keys())
            missing = list(phedex_replicas & unavailable_replicas)

            logging.info(
                'Recovery for %s:%s at %s: PhEDEx has %s, Rucio unavailable %s. Missing: %s ',
                self.scope, self.block_name, self.rse, len(phedex_replicas),
                len(unavailable_replicas), len(missing))

            # Fix up things which are unavailable
            rse_details = get_rse(self.rse)
            rse_id = rse_details['id']
            scope = InternalScope(self.scope)
            state = 'A'

            for name in missing:
                logging.info('Setting available %s:%s at %s', self.scope, name,
                             self.rse)
                core_update_state(rse_id=rse_id,
                                  scope=scope,
                                  name=name,
                                  state=state)

            monitor.record_counter('cms_sync.files_made_available',
                                   delta=len(missing))

        return
Example #22
def bulk_submit_xfer(submitjob, recursive=False):
    cfg = load_config()
    client_id = cfg['globus']['apps'][GLOBUS_AUTH_APP]['client_id']
    auth_client = NativeAppAuthClient(client_id)
    refresh_token = cfg['globus']['apps'][GLOBUS_AUTH_APP]['refresh_token']
    source_endpoint_id = submitjob[0].get('metadata').get(
        'source_globus_endpoint_id')
    destination_endpoint_id = submitjob[0].get('metadata').get(
        'dest_globus_endpoint_id')
    authorizer = RefreshTokenAuthorizer(refresh_token=refresh_token,
                                        auth_client=auth_client)
    tc = TransferClient(authorizer=authorizer)

    # make job_label for task a timestamp
    now = datetime.datetime.now()
    job_label = now.strftime('%Y%m%d%H%M%s')

    # retrieve globus_task_deadline value to enforce time window to complete transfers
    # default is 2880 minutes or 48 hours
    globus_task_deadline = config_get_int('conveyor', 'globus_task_deadline',
                                          False, 2880)
    deadline = now + datetime.timedelta(minutes=globus_task_deadline)

    # from Globus... sync_level=checksum means that before files are transferred, Globus will compute checksums on the source
    # and destination files, and only transfer files that have different checksums are transferred. verify_checksum=True means
    # that after a file is transferred, Globus will compute checksums on the source and destination files to verify that the
    # file was transferred correctly.  If the checksums do not match, it will redo the transfer of that file.
    tdata = TransferData(tc,
                         source_endpoint_id,
                         destination_endpoint_id,
                         label=job_label,
                         sync_level="checksum",
                         deadline=str(deadline))

    for file in submitjob:
        source_path = file.get('sources')[0]
        dest_path = file.get('destinations')[0]
        filesize = file['metadata']['filesize']
        # TODO: support passing a recursive parameter to Globus
        # md5 = file['metadata']['md5']
        # tdata.add_item(source_path, dest_path, recursive=False, external_checksum=md5)
        tdata.add_item(source_path, dest_path, recursive=False)
        record_counter(
            'daemons.conveyor.transfer_submitter.globus.transfers.submit.filesize',
            filesize)

    # logging.info('submitting transfer...')
    transfer_result = tc.submit_transfer(tdata)
    # logging.info("task_id =", transfer_result["task_id"])

    return transfer_result["task_id"]
Example #23
    def query_latest(self, state, last_nhours=1):
        """
        Query the latest transfer statuses in FTS3 via JSON.

        :param state: Transfer states as an iterable of strings.
        :param last_nhours: Time window in hours as an integer.
        :returns: Transfer status information as a dictionary.
        """

        jobs = None

        try:
            whoami = requests.get('%s/whoami' % (self.external_host),
                                  verify=self.verify,
                                  cert=self.cert,
                                  headers={'Content-Type': 'application/json'})
            if whoami and whoami.status_code == 200:
                delegation_id = whoami.json()['delegation_id']
            else:
                raise Exception('Could not retrieve delegation id: %s' %
                                whoami.content)
            state_string = ','.join(state)
            jobs = requests.get(
                '%s/jobs?dlg_id=%s&state_in=%s&time_window=%s' %
                (self.external_host, delegation_id, state_string, last_nhours),
                verify=self.verify,
                cert=self.cert,
                headers={'Content-Type': 'application/json'})
        except ReadTimeout as error:
            raise TransferToolTimeout(error)
        except JSONDecodeError as error:
            raise TransferToolWrongAnswer(error)
        except Exception:
            logging.warn('Could not query latest terminal states from %s',
                         self.external_host)

        if jobs and (jobs.status_code == 200 or jobs.status_code == 207):
            record_counter('transfertool.fts3.%s.query_latest.success' %
                           self.__extract_host(self.external_host))
            try:
                jobs_json = jobs.json()
                return jobs_json
            except ReadTimeout as error:
                raise TransferToolTimeout(error)
            except JSONDecodeError as error:
                raise TransferToolWrongAnswer(error)
            except Exception as error:
                logging.error("Failed to parse the jobs status %s" %
                              (str(error)))

        record_counter('transfertool.fts3.%s.query.failure' %
                       self.__extract_host(self.external_host))
Example #24
    def update_replicas(self):
        """
        Adds or removes replicas for the dataset at the RSE.
        """

        with monitor.record_timer_block('cms_sync.time_update_replica'):
            logging.info('Updating replicas for %s:%s at %s', self.scope,
                         self.block_name, self.rse)
            replicas = list_replicas(dids=[{
                'scope': self.scope,
                'name': self.block_name
            }],
                                     rse_expression='rse=%s' % self.rse)
            try:
                rucio_replicas = {repl['name'] for repl in replicas}
            except TypeError:
                rucio_replicas = set()

            phedex_replicas = set(self.replicas.keys())
            missing = list(phedex_replicas - rucio_replicas)
            to_remove = list(rucio_replicas - phedex_replicas)

            if missing and (len(phedex_replicas) != len(missing)):
                logging.warn(
                    'Recovery: Inconsistency found for %s at %s: %s in PhEDEx and %s missing',
                    self.rse, self.block_name, len(phedex_replicas),
                    len(missing))

            if missing:
                logging.info('Some or all replicas for %s at %s missing',
                             self.rse, self.block_name)
                lfns_added = self.add_missing_replicas(missing)
                monitor.record_counter('cms_sync.files_added',
                                       delta=lfns_added)
            if to_remove:
                logging.info('Removing replicas for %s at %s', self.rse,
                             self.block_name)

                lfns_removed = self.remove_extra_replicas(to_remove)
                monitor.record_counter('cms_sync.files_removed',
                                       delta=lfns_removed)

            if not missing and not to_remove:
                logging.warn('Something very off for %s at %s', self.rse,
                             self.block_name)
                logging.warn('Phedex: %s', phedex_replicas)
                logging.warn('Rucio: %s', rucio_replicas)
                logging.warn('Missing: %s', missing)
                logging.warn('To remove: %s', to_remove)

        return
Example #25
def necromancer(thread=0, bulk=5, once=False):
    """
    Creates a Necromancer Worker that gets a list of bad replicas for a given hash,
    identify lost DIDs and for non-lost ones, set the locks and rules for reevaluation.

    :param thread: Thread number at startup.
    :param bulk: The number of requests to process.
    :param once: Run only once.
    """

    sleep_time = 60
    update_history_threshold = 3600
    update_history_time = time.time()

    executable = ' '.join(argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)

    while not graceful_stop.is_set():

        hb = heartbeat.live(executable, hostname, pid, hb_thread)
        prepend_str = 'Thread [%i/%i] : ' % (hb['assign_thread'] + 1, hb['nr_threads'])

        stime = time.time()
        try:
            replicas = list_bad_replicas(limit=bulk, thread=hb['assign_thread'], total_threads=hb['nr_threads'])

            for replica in replicas:
                scope, name, rse_id, rse = replica['scope'], replica['name'], replica['rse_id'], replica['rse']
                logging.info(prepend_str + 'Working on %s:%s on %s' % (scope, name, rse))

                rep = [r for r in list_replicas([{'scope': scope, 'name': name}, ])]
                if (not rep[0]['rses']) or (rep[0]['rses'].keys() == [rse]):
                    logging.info(prepend_str + 'File %s:%s has no other replicas, it will be marked as lost' % (scope, name))
                    try:
                        update_rules_for_lost_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.lostfile', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

                else:
                    logging.info(prepend_str + 'File %s:%s can be recovered. Available sources : %s' % (scope, name, str(rep[0]['rses'])))
                    try:
                        update_rules_for_bad_replica(scope=scope, name=name, rse_id=rse_id, nowait=True)
                        monitor.record_counter(counters='necromancer.badfiles.recovering', delta=1)
                    except DatabaseException as error:
                        logging.info(prepend_str + '%s' % (str(error)))

            logging.info(prepend_str + 'It took %s seconds to process %s replicas' % (str(time.time() - stime), str(len(replicas))))
Example #26
def touch_request(request_id, session=None):
    """
    Update the timestamp of a request. Fails silently if the request_id does not exist.

    :param request_id: Request-ID as a 32 character hex string.
    :param session: Database session to use.
    """

    record_counter('core.request.touch_request')

    try:
        rowcount = session.query(models.Request).filter_by(id=request_id).update({'updated_at': datetime.datetime.utcnow()}, synchronize_session=False)
    except IntegrityError as e:
        raise RucioException(e.args)
Example #27
def __release_all_activities(stats, direction, rse_name, rse_id):
    """
    Release requests if activities should be ignored.

    :param stats:          Request statistics
    :param direction:      String whether request statistics are based on source or destination RSEs.
    :param rse_name:       RSE name.
    :param rse_id:         RSE id.
    """
    threshold = stats['threshold']
    transfer = stats['transfer']
    waiting = stats['waiting']
    strategy = stats['strategy']
    if threshold is not None and transfer + waiting > threshold:
        record_gauge(
            'daemons.conveyor.throttler.set_rse_transfer_limits.%s.max_transfers'
            % (rse_name), threshold)
        record_gauge(
            'daemons.conveyor.throttler.set_rse_transfer_limits.%s.transfers' %
            (rse_name), transfer)
        record_gauge(
            'daemons.conveyor.throttler.set_rse_transfer_limits.%s.waitings' %
            (rse_name), waiting)
        if transfer < 0.8 * threshold:
            to_be_released = threshold - transfer
            if strategy == 'grouped_fifo':
                deadline = stats.get('deadline')
                volume = stats.get('volume')
                release_waiting_requests_grouped_fifo(rse_id,
                                                      count=to_be_released,
                                                      direction=direction,
                                                      volume=volume,
                                                      deadline=deadline)
            elif strategy == 'fifo':
                release_waiting_requests_fifo(rse_id,
                                              count=to_be_released,
                                              direction=direction)
        else:
            logging.debug(
                "Throttler has done nothing on rse %s (transfer >= 0.8 * threshold)"
                % rse_name)
    elif waiting > 0 or not threshold:
        logging.debug(
            "Throttler remove limits(threshold: %s) and release all waiting requests, rse %s"
            % (threshold, rse_name))
        delete_rse_transfer_limits(rse_id, activity='all_activities')
        release_all_waiting_requests(rse_id, direction=direction)
        record_counter(
            'daemons.conveyor.throttler.delete_rse_transfer_limits.%s' %
            (rse_name))
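A worked instance of the release rule above, with illustrative numbers only: threshold=100, transfer=60, waiting=50 passes the transfer + waiting > threshold gate, and since 60 < 0.8 * 100 the throttler releases threshold - transfer = 40 waiting requests:

threshold, transfer, waiting = 100, 60, 50  # illustrative numbers only

if threshold is not None and transfer + waiting > threshold:
    if transfer < 0.8 * threshold:
        to_be_released = threshold - transfer
        print(to_be_released)  # 40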
Example #28
def cancel(tid, session):
    """
    Kills a transfer by setting its state to CANCELED.

    :param tid: The transfer job id.
    :param session: Database session to use.
    """

    record_counter('daemons.mock.fts3.cancel')

    ts = time.time()
    query = session.query(test_models.MockFTSTransfer).filter(test_models.MockFTSTransfer.transfer_id == tid)
    query.update({'state': FTSState.CANCELED,
                  'last_modified': datetime.datetime.utcnow()})
    record_timer('daemons.mock.fts3.cancel.update_state', (time.time()-ts)*1000)
Example #29
def trace(payload):
    """
    Write a trace to log file and send it to active mq.

    :param payload: Python dictionary with trace report.
    """

    record_counter('trace.trace')
    report = json.dumps(payload, default=date_handler)
    LOGGER.debug(report)

    t_conns = CONNS[:]

    try:
        for i in range(len(t_conns)):
            try:
                conn = random.sample(t_conns, 1)[0]
                if not conn.is_connected():
                    logging.info(
                        'reconnect to ' +
                        conn.transport._Transport__host_and_ports[0][0])
                    conn.start()
                    conn.connect(USERNAME, PASSWORD)
            except stomp.exception.NotConnectedException as error:
                logging.warn(
                    'Could not connect to broker %s, try another one' %
                    conn.transport._Transport__host_and_ports[0][0])
                t_conns.remove(conn)
                continue
            except stomp.exception.ConnectFailedException as error:
                logging.warn(
                    'Could not connect to broker %s, try another one' %
                    conn.transport._Transport__host_and_ports[0][0])
                t_conns.remove(conn)
                continue

        if conn.is_connected():
            conn.send(body=report,
                      destination=TOPIC,
                      headers={
                          'persistent': 'true',
                          'appversion': 'rucio'
                      })
        else:
            logging.error(
                "Unable to connect to broker. Could not send trace: %s" %
                report)
    except Exception as error:
        logging.error(error)
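trace copies CONNS before sampling so that dropping dead brokers never mutates the shared list; the pick-and-drop loop in isolation (connect_ok is a hypothetical stand-in for the is_connected/connect calls above):

import random

candidates = list(CONNS)  # work on a copy; the shared CONNS list stays intact
conn = None
while candidates:
    conn = random.choice(candidates)
    if connect_ok(conn):  # hypothetical stand-in for conn.is_connected()/connect()
        break
    candidates.remove(conn)  # drop the dead broker and try another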
Example #30
File: fts3.py Project: zzaiin/Rucio
def get_jobs_response(transfer_host, fts_session, jobs_response):
    """
    Parse FTS bulk query response and query details for finished jobs.

    :param transfer_host: FTS server as a string.
    :param fts_session: Query request as a session.
    :param jobs_response: FTS bulk query response as a dict.
    :returns: Transfer status information as a dictionary.
    """

    responses = {}
    for job_response in jobs_response:
        transfer_id = job_response['job_id']
        if job_response['http_status'] == "404 Not Found":
            responses[transfer_id] = None
        elif job_response['http_status'] == "200 Ok":
            if job_response['job_state'] not in (str(FTSState.FAILED),
                                                 str(FTSState.FINISHEDDIRTY),
                                                 str(FTSState.CANCELED),
                                                 str(FTSState.FINISHED)):
                responses[transfer_id] = {}
                responses[transfer_id]['job_state'] = job_response['job_state']
                responses[transfer_id]['new_state'] = None
                responses[transfer_id]['transfer_id'] = transfer_id
            else:
                if transfer_host.startswith("https"):
                    files = fts_session.get(
                        '%s/jobs/%s/files' % (transfer_host, transfer_id),
                        verify=False,
                        cert=(__USERCERT, __USERCERT),
                        headers={'Content-Type': 'application/json'})
                else:
                    files = fts_session.get(
                        '%s/jobs/%s/files' % (transfer_host, transfer_id),
                        headers={'Content-Type': 'application/json'})
                if files and (files.status_code == 200
                              or files.status_code == 207):
                    record_counter(
                        'transfertool.fts3.%s.jobs_response.success' %
                        __extract_host(transfer_host))
                    responses[transfer_id] = format_response(
                        transfer_host, job_response, files.json())
                else:
                    record_counter(
                        'transfertool.fts3.%s.jobs_response.failure' %
                        __extract_host(transfer_host))
                    responses[transfer_id] = Exception(
                        'Could not retrieve files information: %s' % files)
    return responses
Example #31
def set_external_host(request_id, external_host, session=None):
    """
    Update the external host of a request. Fails silently if the request_id does not exist.

    :param request_id: Request-ID as a 32 character hex string.
    :param external_host: Selected external host as string in format protocol://fqdn:port
    :param session: Database session to use.
    """

    record_counter('core.request.set_external_host')

    try:
        session.query(models.Request).filter_by(id=request_id).update({'external_host': external_host}, synchronize_session=False)
    except IntegrityError as e:
        raise RucioException(e.args)
Example #32
def set_request_state(request_id, new_state, session=None):
    """
    Update the state of a request. Fails silently if the request_id does not exist.

    :param request_id: Request-ID as a 32 character hex string.
    :param new_state: New state as string.
    :param session: Database session to use.
    """

    record_counter('core.request.set_request_state')

    try:
        session.query(models.Request).filter_by(id=request_id).update({'state': new_state}, synchronize_session=False)
    except IntegrityError as e:
        raise RucioException(e.args)
Example #33
    def on_message(self, headers, message):
        record_counter('daemons.cache.consumer2.message')
#        id = msg['id']
#        if id % self.__num_thread == self.__id:
#            self.message_handle(msg['payload'])
        try:
            msg = json.loads(message)
            if isinstance(msg, dict) and 'operation' in msg.keys():
                if msg['operation'] == 'add_replicas':
                    logging.info('add_replicas to RSE %s: %s ' % (msg['rse'], str(msg['files'])))
                    add_volatile_replicas(rse=msg['rse'], replicas=msg['files'])
                elif msg['operation'] == 'delete_replicas':
                    logging.info('delete_replicas to RSE %s: %s ' % (msg['rse'], str(msg['files'])))
                    delete_volatile_replicas(rse=msg['rse'], replicas=msg['files'])
        except Exception:
            logging.error(str(format_exc()))
Example #34
    def update_rule(self):
        """
        Adds or removes the rule for the block.
        """

        rules = list_replication_rules(filters={
            'scope': self.scope,
            'name': self.block_name
        })
        # rules = self.rcli.list_did_rules(scope=self.scope, name=self.block_name)
        rse_expression = 'rse=' + self.rse

        remove_rules = [
            rule for rule in rules if rule['account'] == self.account
            and rule['rse_expression'] == rse_expression
        ]

        if not remove_rules and self.is_at_pnn:
            self.rule_exists = False
            if self.dry_run:
                logging.info("Dry run: Adding rule for dataset %s at rse %s.",
                             self.block_name, self.rse)
            else:
                self.add_replication_rule_with_defaults(
                    dids=[{
                        'scope': self.scope,
                        'name': self.block_name
                    }],
                    copies=1,
                    rse_expression=rse_expression,
                    account=self.account)
                monitor.record_counter('cms_sync.rules_added')
            self.rule_exists = True
        elif remove_rules and not self.is_at_pnn:
            self.rule_exists = True
            if self.dry_run:
                logging.info("Removing rules for dataset %s at rse %s.",
                             self.block_name, self.rse)
            else:
                for rule in remove_rules:
                    # delete_replication_rule(rule['id'], purge_replicas=False, issuer=self.account)
                    delete_rule(rule_id=rule['id'],
                                purge_replicas=True,
                                soft=False)
                    monitor.record_counter('cms_sync.rules_removed')
            self.rule_exists = False
Example #35
    def register_container(self):
        self.container_exists = False
        if self.dry_run:
            logging.info('Dry Run: Create container %s in scope %s.',
                         self.container, self.scope)
            self.container_exists = True
            return self.container_exists

        existed, created, attached, already_attached = self.register_and_attach_did(
            scope=self.scope, name=self.container, did_type='CONTAINER')
        self.container_exists = existed | created
        if existed:
            monitor.record_counter('cms_sync.container_exists')
        if created:
            monitor.record_counter('cms_sync.container_created')

        return self.container_exists
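existed | created is a bitwise OR applied to two booleans, which behaves like logical or here because both operands are already evaluated; a one-line check:

assert (True | False) == (True or False) == True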
Example #36
def get_jobs_response(transfer_host, fts_session, jobs_response):
    """
    Parse FTS bulk query response and query details for finished jobs.

    :param transfer_host: FTS server as a string.
    :param fts_session: Query request as a session.
    :param jobs_response: FTS bulk query response as a dict.
    :returns: Transfer status information as a dictionary.
    """

    responses = {}
    for job_response in jobs_response:
        transfer_id = job_response["job_id"]
        if job_response["http_status"] == "404 Not Found":
            responses[transfer_id] = None
        elif job_response["http_status"] == "200 Ok":
            if not job_response["job_state"] in (
                str(FTSState.FAILED),
                str(FTSState.FINISHEDDIRTY),
                str(FTSState.CANCELED),
                str(FTSState.FINISHED),
            ):
                responses[transfer_id] = {}
                responses[transfer_id]["job_state"] = job_response["job_state"]
                responses[transfer_id]["new_state"] = None
                responses[transfer_id]["transfer_id"] = transfer_id
            else:
                if transfer_host.startswith("https"):
                    files = fts_session.get(
                        "%s/jobs/%s/files" % (transfer_host, transfer_id),
                        verify=False,
                        cert=(__USERCERT, __USERCERT),
                        headers={"Content-Type": "application/json"},
                    )
                else:
                    files = fts_session.get(
                        "%s/jobs/%s/files" % (transfer_host, transfer_id), headers={"Content-Type": "application/json"}
                    )
                if files and files.status_code == 200:
                    record_counter("transfertool.fts3.%s.jobs_response.success" % __extract_host(transfer_host))
                    responses[transfer_id] = format_response(transfer_host, job_response, files.json())
                else:
                    record_counter("transfertool.fts3.%s.jobs_response.failure" % __extract_host(transfer_host))
                    responses[transfer_id] = Exception("Could not retrieve files information: %s", files)
    return responses
Example #37
def undertaker(worker_number=1, total_workers=1, chunk_size=5, once=False):
    """
    Main loop to select and delete dids.
    """
    logging.info('Undertaker(%s): starting', worker_number)
    logging.info('Undertaker(%s): started', worker_number)
    hostname = socket.gethostname()
    pid = os.getpid()
    thread = threading.current_thread()
    sanity_check(executable='rucio-undertaker', hostname=hostname)
    while not GRACEFUL_STOP.is_set():
        try:
            heartbeat = live(executable='rucio-undertaker',
                             hostname=hostname,
                             pid=pid,
                             thread=thread,
                             older_than=6000)
            logging.info(
                'Undertaker({0[worker_number]}/{0[total_workers]}): Live gives {0[heartbeat]}'
                .format(locals()))

            dids = list_expired_dids(worker_number=heartbeat['assign_thread'] + 1,
                                     total_workers=heartbeat['nr_threads'],
                                     limit=10000)
            if not dids and not once:
                logging.info('Undertaker(%s): Nothing to do. sleep 60.',
                             worker_number)
                time.sleep(60)
                continue

            for chunk in chunks(dids, chunk_size):
                try:
                    logging.info('Undertaker(%s): Received %s dids to delete',
                                 worker_number, len(chunk))
                    delete_dids(dids=chunk, account='root')
                    logging.info('Undertaker(%s): Deleted %s dids',
                                 worker_number, len(chunk))
                    record_counter(counters='undertaker.delete_dids',
                                   delta=len(chunk))
                except RuleNotFound as error:
                    logging.error(error)
                except DatabaseException as error:
                    logging.error('Undertaker(%s): Got database error %s.',
                                  worker_number, str(error))
Example #38
def query_request(request_id, transfertool='fts3', session=None):
    """
    Query the status of a request.

    :param request_id: Request-ID as a 32 character hex string.
    :param transfertool: Transfertool name as a string.
    :param session: Database session to use.
    :returns: Request status information as a dictionary.
    """

    record_counter('core.request.query_request')

    req = get_request(request_id, session=session)

    req_status = {'request_id': request_id,
                  'new_state': None}

    if not req:
        req_status['new_state'] = RequestState.LOST
        return req_status

    if transfertool == 'fts3':
        ts = time.time()
        response = fts3.query(req['external_id'], req['external_host'])
        record_timer('core.request.query_request_fts3', (time.time() - ts) * 1000)
        req_status['details'] = response

        if not response:
            req_status['new_state'] = RequestState.LOST
        else:
            if 'job_state' not in response:
                req_status['new_state'] = RequestState.LOST
            elif response['job_state'] in (str(FTSState.FAILED),
                                           str(FTSState.FINISHEDDIRTY),
                                           str(FTSState.CANCELED)):
                req_status['new_state'] = RequestState.FAILED
            elif response['job_state'] == str(FTSState.FINISHED):
                req_status['new_state'] = RequestState.DONE
    else:
        raise NotImplementedError

    return req_status
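A hedged usage sketch for the poller above; the request id is a placeholder and the session is assumed to be injected by Rucio's session decorators:

request_id = '0123456789abcdef0123456789abcdef'   # placeholder 32-char hex id
status = query_request(request_id, transfertool='fts3')
if status['new_state'] == RequestState.LOST:
    print('request %s is lost' % status['request_id'])
elif status['new_state'] in (RequestState.FAILED, RequestState.DONE):
    print('terminal state %s, details: %s' % (status['new_state'], status.get('details')))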
Example #39
0
    def LIST_DIDS_WILDCARD(self, scope, wildcard):
        jdoe_account = 'jdoe'
        client = DIDClient(account=jdoe_account)

        print('run with: ' + str(wildcard))
        start = time()
        with monitor.record_timer_block('jdoe.list_dids_wildcard'):
            dids = [did for did in client.list_dids(scope=scope, filters=wildcard, type='dataset')]

        duration = time() - start
        cnt = len(dids)
        print('got %d dids' % cnt)

        monitor.record_counter('jdoe.list_dids_wildcard.num_results', cnt)
        if cnt != 0:
            monitor.record_counter('jdoe.list_dids_wildcard.time_per_did', duration / cnt)

        return {'no_datasets': cnt}
Example #40
0
File: fts3.py Project: zzaiin/Rucio
def new_bulk_query(transfer_ids, transfer_host):
    """
    Query the status of a bulk of transfers in FTS3 via JSON.

    :param transfer_ids: FTS transfer identifiers as a list.
    :param transfer_host: FTS server as a string.
    :returns: Transfer status information as a dictionary.
    """

    responses = {}
    if transfer_host.startswith('https://'):
        fts_session = requests.Session()
        jobs = fts_session.get('%s/jobs/%s' %
                               (transfer_host, ','.join(transfer_ids)),
                               verify=False,
                               cert=(__USERCERT, __USERCERT),
                               headers={'Content-Type': 'application/json'})
        if jobs and (jobs.status_code == 200 or jobs.status_code == 207):
            record_counter('transfertool.fts3.%s.new_bulk.success' %
                           __extract_host(transfer_host))
            jobs_response = jobs.json()
            responses = get_jobs_response(transfer_host, fts_session,
                                          jobs_response)
            for transfer_id in transfer_ids:
                if transfer_id not in responses.keys():
                    responses[transfer_id] = None
        else:
            record_counter('transfertool.fts3.%s.new_bulk.failure' %
                           __extract_host(transfer_host))
            for transfer_id in transfer_ids:
                responses[transfer_id] = Exception(
                    'Could not retrieve transfer information: %s' % jobs)
    else:
        fts_session = requests.Session()
        jobs = fts_session.get('%s/jobs/%s' % (transfer_host, ','.join(transfer_ids)),
                               headers={'Content-Type': 'application/json'})
        if jobs and (jobs.status_code == 200 or jobs.status_code == 207):
            record_counter('transfertool.fts3.%s.new_bulk.success' %
                           __extract_host(transfer_host))
            jobs_response = jobs.json()
            responses = get_jobs_response(transfer_host, fts_session,
                                          jobs_response)
            for transfer_id in transfer_ids:
                if transfer_id not in responses.keys():
                    responses[transfer_id] = None
        else:
            record_counter('transfertool.fts3.%s.new_bulk.failure' %
                           __extract_host(transfer_host))
            for transfer_id in transfer_ids:
                responses[transfer_id] = Exception(
                    'Could not retrieve transfer information: %s' % jobs)

    return responses
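A short usage sketch (host and job ids are placeholders); note that per-transfer failures come back as Exception objects in the result rather than being raised:

responses = new_bulk_query(['job-id-1', 'job-id-2'],          # placeholder FTS job ids
                           'https://fts3.example.org:8446')   # placeholder host
for tid, resp in responses.items():
    if isinstance(resp, Exception):
        print('query failed for %s: %s' % (tid, resp))
    elif resp is None:
        print('no information returned for %s' % tid)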
Example #41
0
def set_transfer_update_time(external_host, transfer_id, update_time=None, session=None):
    """
    Update the last-update timestamp of a submitted request. Raises UnsupportedOperation if the transfer_id does not exist in the SUBMITTED state.

    :param external_host:  Selected external host as string in format protocol://fqdn:port
    :param transfer_id:    External transfer job id as a string.
    :param update_time:    Time stamp; defaults to the current UTC time.
    :param session:        Database session to use.
    """

    record_counter('core.request.set_transfer_update_time')

    if update_time is None:
        # evaluate the timestamp at call time, not once at import time
        update_time = datetime.datetime.utcnow()

    try:
        rowcount = session.query(models.Request).filter_by(external_id=transfer_id, state=RequestState.SUBMITTED).update({'updated_at': update_time}, synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)

    if not rowcount:
        raise UnsupportedOperation("Transfer %s doesn't exist or its status is not submitted." % (transfer_id))
Example #42
0
def __set_transfer_state(external_host, transfer_id, new_state, session=None):
    """
    Update the state of a transfer. Raises UnsupportedOperation if the transfer_id does not exist.

    :param external_host:  Selected external host as string in format protocol://fqdn:port
    :param transfer_id:    External transfer job id as a string.
    :param new_state:      New state as string.
    :param session:        Database session to use.
    """

    record_counter('core.request.set_transfer_state')

    try:
        rowcount = session.query(models.Request).filter_by(external_id=transfer_id).update({'state': new_state, 'updated_at': datetime.datetime.utcnow()}, synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)

    if not rowcount:
        raise UnsupportedOperation("Transfer %s on %s state %s cannot be updated." % (transfer_id, external_host, new_state))
Example #43
0
    def on_message(self, frame):
        record_counter('daemons.conveyor.receiver.message_all')

        msg = json.loads(frame.body)

        if not self.__all_vos:
            if 'vo' not in msg or msg['vo'] != get_policy():
                return

        if 'job_metadata' in msg \
           and isinstance(msg['job_metadata'], dict) \
           and 'issuer' in msg['job_metadata'] \
           and str(msg['job_metadata']['issuer']) == 'rucio':

            if 'job_state' in msg and str(msg['job_state']) != 'ACTIVE':
                record_counter('daemons.conveyor.receiver.message_rucio')

                self._perform_request_update(msg)
Example #44
0
    def query(self, transfer_ids, details=False, timeout=None):
        """
        Query the status of a transfer in FTS3 via JSON.

        :param transfer_ids: FTS transfer identifiers as list of strings.
        :param details:      Switch if detailed information should be listed.
        :param timeout:      Timeout in seconds.
        :returns:            Transfer status information as a list of dictionaries.
        """

        if len(transfer_ids) > 1:
            raise NotImplementedError('FTS3 transfertool query not bulk ready')

        transfer_id = transfer_ids[0]
        if details:
            return self.__query_details(transfer_id=transfer_id)

        job = requests.get('%s/jobs/%s' % (self.external_host, transfer_id),
                           verify=self.verify,
                           cert=self.cert,
                           headers=self.headers,
                           timeout=timeout)  # TODO Set to 5 in conveyor
        if job and job.status_code == 200:
            record_counter('transfertool.fts3.%s.query.success' %
                           self.__extract_host(self.external_host))
            labels = {
                'state': 'success',
                'host': self.__extract_host(self.external_host)
            }
            QUERY_COUNTER.labels(**labels).inc()
            return [job.json()]

        record_counter('transfertool.fts3.%s.query.failure' %
                       self.__extract_host(self.external_host))
        labels = {
            'state': 'failure',
            'host': self.__extract_host(self.external_host)
        }
        QUERY_COUNTER.labels(**labels).inc()
        raise Exception('Could not retrieve transfer information: %s' %
                        job.content)
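A hedged usage sketch for the single-job query; the constructor call mirrors how FTS3Transfertool is instantiated elsewhere in this collection (see Example #59), and the host is a placeholder:

tool = FTS3Transfertool(external_host='https://fts3.example.org:8446')  # placeholder host
try:
    job_info = tool.query(['job-id-1'])[0]   # exactly one id: bulk query is not implemented
    print(job_info.get('job_state'))
except Exception as error:
    print('query failed: %s' % error)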
Example #45
0
def bulk_check_xfers(task_ids):
    tc = get_transfer_client()

    logging.debug('task_ids: %s' % task_ids)

    responses = {}

    for task_id in task_ids:
        transfer = tc.get_task(str(task_id))
        logging.debug('transfer: %s' % transfer)
        status = str(transfer["status"])
        if status == 'SUCCEEDED':
            record_counter('daemons.conveyor.transfer_submitter.globus.transfers.bytes_transferred', transfer['bytes_transferred'])
            record_counter('daemons.conveyor.transfer_submitter.globus.transfers.effective_bytes_per_second', transfer['effective_bytes_per_second'])
        responses[str(task_id)] = status

    logging.debug('responses: %s' % responses)

    return responses
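A usage sketch with a placeholder Globus task id; the function only records byte counters for SUCCEEDED tasks and always returns one status string per task:

statuses = bulk_check_xfers(['11111111-2222-3333-4444-555555555555'])  # placeholder uuid
for task_id, status in statuses.items():
    if status in ('SUCCEEDED', 'FAILED'):
        print('task %s reached terminal state %s' % (task_id, status))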
Example #46
0
def touch_transfer(external_host, transfer_id, session=None):
    """
    Update the timestamp of requests in a transfer. Fails silently if the transfer_id does not exist.

    :param external_host:  Name of the external host.
    :param transfer_id:    External transfer job id as a string.
    :param session:        Database session to use.
    """

    record_counter('core.request.touch_transfer')

    try:
        # don't touch it if it's already touched in 30 seconds
        session.query(models.Request).with_hint(models.Request, "INDEX(REQUESTS REQUESTS_EXTERNALID_UQ)", 'oracle')\
                                     .filter_by(external_id=transfer_id)\
                                     .filter(models.Request.state == RequestState.SUBMITTED)\
                                     .filter(models.Request.updated_at < datetime.datetime.utcnow() - datetime.timedelta(seconds=30))\
                                     .update({'updated_at': datetime.datetime.utcnow()}, synchronize_session=False)
    except IntegrityError as error:
        raise RucioException(error.args)
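Because of the 30-second guard in the query above, calling this from a hot loop is cheap. A one-line usage sketch with placeholder arguments:

# Keep-alive: refresh updated_at for every SUBMITTED request in this FTS job.
touch_transfer('https://fts3.example.org:8446', 'job-id-1')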
Example #47
0
def submit(tinfo, session):
    """
    Create a new transfer job in state QUEUED.

    :param tinfo: The transfer job information as a string.
    :param session: Database session to use.
    :returns: Dictionary with the transfer job id.
    """

    record_counter('daemons.mock.fts3.submit')

    ts = time.time()
    tid = generate_uuid()
    record_timer('daemons.mock.fts3.submit.000-generate_uuid', (time.time()-ts)*1000)

    ts = time.time()
    new_transfer = test_models.MockFTSTransfer(transfer_id=tid, transfer_metadata=str(tinfo))
    new_transfer.save(session=session)
    record_timer('daemons.mock.fts3.submit.001-new_transfer', (time.time()-ts)*1000)

    return {'job_id': tid}
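A minimal sketch of driving the mock transfertool, assuming a test database session is available; tinfo is opaque to the mock and stored verbatim:

result = submit(tinfo="{'rse': 'MOCK'}", session=session)   # session: test DB session
print(result['job_id'])                                     # UUID assigned by the mock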
Example #48
0
def trace(payload):
    """
    Write a trace to log file and send it to active mq.

    :param payload: Python dictionary with trace report.
    """

    record_counter('trace.trace')
    report = json.dumps(payload, default=date_handler)
    logger.info(report)

    try:
        conn = random.sample(conns, 1)[0]
        if not conn.is_connected():
            logging.info('reconnect to ' + conn.transport._Transport__host_and_ports[0][0])
            conn.start()
            conn.connect(username, password)
        conn.send(body=report, destination=topic, headers={'persistent': 'true', 'appversion': 'rucio'})
    except Exception as e:
        errlog.error(e)
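A hedged example payload; real trace reports carry site- and client-specific fields, so the keys below are illustrative only:

trace({'eventType': 'download',        # illustrative fields only
       'clientState': 'DONE',
       'filename': 'file.root',
       'timeStart': 1500000000.0})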
Example #49
0
    def whoami(self):
        """
        Returns credential information from the FTS3 server.

        :returns: Credentials as stored by the FTS3 server as a dictionary.
        """

        get_result = requests.get('%s/whoami' % self.external_host,
                                  verify=self.verify,
                                  cert=self.cert,
                                  headers={'Content-Type': 'application/json'})

        if get_result and get_result.status_code == 200:
            record_counter('transfertool.fts3.%s.whoami.success' % self.__extract_host(self.external_host))
            return get_result.json()

        record_counter('transfertool.fts3.%s.whoami.failure' % self.__extract_host(self.external_host))
        raise Exception('Could not retrieve credentials: %s' % get_result.content)
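A quick sanity-check sketch, reusing the tool instance pattern from Example #44; the output format is whatever the FTS3 server stores:

tool = FTS3Transfertool(external_host='https://fts3.example.org:8446')  # placeholder host
try:
    print(tool.whoami())   # delegated credential info as returned by the server
except Exception as error:
    print('whoami failed: %s' % error)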
Example #50
0
    def version(self):
        """
        Returns FTS3 server information.

        :returns: FTS3 server information as a dictionary.
        """

        get_result = requests.get('%s/' % self.external_host,
                                  verify=self.verify,
                                  cert=self.cert,
                                  headers={'Content-Type': 'application/json'})

        if get_result and get_result.status_code == 200:
            record_counter('transfertool.fts3.%s.version.success' % self.__extract_host(self.external_host))
            return get_result.json()

        record_counter('transfertool.fts3.%s.version.failure' % self.__extract_host(self.external_host))
        raise Exception('Could not retrieve version: %s' % get_result.content)
Example #51
0
def trace(payload):
    """
    Write a trace to log file and send it to active mq.

    :param payload: Python dictionary with trace report.
    """

    record_counter('trace.nongrid_trace')
    report = json.dumps(payload, default=date_handler)
    LOGGER.debug(report)

    try:
        conn = random.sample(CONNS, 1)[0]
        if not conn.is_connected():
            logging.info('reconnect to ' + conn.transport._Transport__host_and_ports[0][0])
            conn.start()
            conn.connect(USERNAME, PASSWORD)
        conn.send(body=report, destination=TOPIC, headers={'persistent': 'true', 'appversion': 'rucio'})
    except Exception as exception:
        ERRLOG.error(exception)
Example #52
0
    def __query_details(self, transfer_id):
        """
        Query the detailed status of a transfer in FTS3 via JSON.

        :param transfer_id: FTS transfer identifier as a string.
        :returns: Detailed transfer status information as a dictionary.
        """

        files = requests.get('%s/jobs/%s/files' % (self.external_host, transfer_id),
                             verify=self.verify,
                             cert=self.cert,
                             headers={'Content-Type': 'application/json'},
                             timeout=5)
        if files and (files.status_code == 200 or files.status_code == 207):
            record_counter('transfertool.fts3.%s.query_details.success' % self.__extract_host(self.external_host))
            return files.json()

        record_counter('transfertool.fts3.%s.query_details.failure' % self.__extract_host(self.external_host))
        return
Example #53
0
def run_once(bulk, group_bulk, rse_ids, scheme, failover_scheme, transfertool_kwargs, heartbeat_handler, activity):
    worker_number, total_workers, logger = heartbeat_handler.live()

    start_time = time.time()
    transfers = next_transfers_to_submit(
        total_workers=total_workers,
        worker_number=worker_number,
        failover_schemes=failover_scheme,
        limit=bulk,
        activity=activity,
        rses=rse_ids,
        schemes=scheme,
        transfertools_by_name={'fts3': FTS3Transfertool},
        older_than=None,
        request_type=RequestType.STAGEIN,
        logger=logger,
    )
    total_transfers = len(list(hop for paths in transfers.values() for path in paths for hop in path))
    record_timer('daemons.conveyor.stager.get_stagein_transfers.per_transfer', (time.time() - start_time) * 1000 / (total_transfers if transfers else 1))
    record_counter('daemons.conveyor.stager.get_stagein_transfers', total_transfers)
    record_timer('daemons.conveyor.stager.get_stagein_transfers.transfers', total_transfers)
    logger(logging.INFO, 'Got %s stagein transfers for %s' % (total_transfers, activity))

    for builder, transfer_paths in transfers.items():
        transfertool_obj = builder.make_transfertool(logger=logger, **transfertool_kwargs.get(builder.transfertool_class, {}))
        logger(logging.INFO, 'Starting to group transfers for %s (%s)' % (activity, transfertool_obj))
        start_time = time.time()
        grouped_jobs = transfertool_obj.group_into_submit_jobs(transfer_paths)
        record_timer('daemons.conveyor.stager.bulk_group_transfer', (time.time() - start_time) * 1000 / (len(transfer_paths) or 1))

        logger(logging.INFO, 'Starting to submit transfers for %s (%s)' % (activity, transfertool_obj))
        for job in grouped_jobs:
            worker_number, total_workers, logger = heartbeat_handler.live()
            submit_transfer(transfertool_obj=transfertool_obj, transfers=job['transfers'], job_params=job['job_params'], submitter='transfer_submitter', logger=logger)

    queue_empty = False
    if total_transfers < group_bulk:
        queue_empty = True
        logger(logging.INFO, 'Only %s transfers for %s which is less than group bulk %s' % (total_transfers, activity, group_bulk))
    return queue_empty
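A hedged invocation sketch; heartbeat_handler is assumed to come from the surrounding daemon framework, and all parameter values are placeholders:

queue_empty = run_once(bulk=100, group_bulk=10, rse_ids=None, scheme=None,
                       failover_scheme=None, transfertool_kwargs={},
                       heartbeat_handler=heartbeat_handler, activity='Staging')
if queue_empty:
    time.sleep(30)   # hypothetical back-off when fewer than group_bulk transfers remain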
Example #54
0
    def register_container(self):
        self.container_exists = False
        if self.is_at_pnn and self.dry_run:
            logging.info('Dry Run: Create container %s in scope %s.',
                         self.container, self.scope)
            self.container_exists = True
            return self.container_exists

        try:
            get_did(scope=self.scope, name=self.container)
            monitor.record_counter('cms_sync.container_exists')
            self.container_exists = True
            logging.info('Found container %s', self.container)
        except DataIdentifierNotFound:
            if self.is_at_pnn:
                try:
                    logging.info('Create container %s in scope %s.',
                                 self.container, self.scope)
                    add_did(scope=self.scope,
                            name=self.container,
                            type='CONTAINER',
                            issuer=self.account,
                            lifetime=self.lifetime)
                    monitor.record_counter('cms_sync.container_created')
                    self.container_exists = True
                    logging.info('Created container %s in scope %s.',
                                 self.container, self.scope)
                except DataIdentifierAlreadyExists:
                    logging.warning('Container was created in the meanwhile')
                    monitor.record_counter('cms_sync.container_collision')
                    self.container_exists = True
            else:
                logging.warning('Container was not at PNN')

        return self.container_exists
Example #55
0
    def register_block(self):
        """
        Register the dataset (if there is a replica at the pnn) and attach to container
        :dry: Dry run. Default false.
        """

        # FIXME: The logic here could use some improvement as we try to create a block even if it exists already

        if self.is_at_pnn and self.dry_run:
            logging.info('Dry Run: Create dataset %s in scope %s.',
                         self.block_name, self.scope)
            self.block_exists = True
            return self.block_exists

        existed, created, attached, already_attached = self.register_and_attach_did(
            scope=self.scope,
            name=self.block_name,
            did_type='DATASET',
            parent_did=self.container)

        self.block_exists = existed or created
        if existed:
            monitor.record_counter('cms_sync.dataset_exists')
        if created:
            monitor.record_counter('cms_sync.dataset_created')
        if not existed and not created:
            monitor.record_counter('cms_sync.dataset_create_failed')

        return self.block_exists
Example #56
0
    def update_priority(self, transfer_id, priority, timeout=None):
        """
        Update the priority of a transfer that has been submitted to FTS via JSON.

        :param transfer_id: FTS transfer identifier as a string.
        :param priority:    FTS job priority as an integer from 1 to 5.
        :param timeout:     Timeout in seconds.
        :returns:           True if update was successful.
        """

        params_dict = {"params": {"priority": priority}}
        params_str = json.dumps(params_dict, cls=APIEncoder)

        job = requests.post('%s/jobs/%s' % (self.external_host, transfer_id),
                            verify=self.verify,
                            data=params_str,
                            cert=self.cert,
                            headers=self.headers,
                            timeout=timeout)  # TODO set to 3 in conveyor

        if job and job.status_code == 200:
            record_counter('transfertool.fts3.%s.update_priority.success' %
                           self.__extract_host(self.external_host))
            labels = {
                'state': 'success',
                'host': self.__extract_host(self.external_host)
            }
            UPDATE_PRIORITY_COUNTER.labels(**labels).inc()
            return job.json()

        record_counter('transfertool.fts3.%s.update_priority.failure' %
                       self.__extract_host(self.external_host))
        labels = {
            'state': 'failure',
            'host': self.__extract_host(self.external_host)
        }
        UPDATE_PRIORITY_COUNTER.labels(**labels).inc()
        raise Exception('Could not update priority of transfer: %s' %
                        job.content)
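A usage sketch; the priority bounds come from the docstring above, the job id is a placeholder, and tool is an FTS3Transfertool instance as in Example #44:

try:
    tool.update_priority('job-id-1', priority=4)   # integer from 1 to 5, per the docstring
except Exception as error:
    print('priority update failed: %s' % error)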
Example #57
0
    def cancel(self, transfer_ids, timeout=None):
        """
        Cancel transfers that have been submitted to FTS3.

        :param transfer_ids: FTS transfer identifiers as list of strings.
        :param timeout:      Timeout in seconds.
        :returns:            True if cancellation was successful.
        """

        if len(transfer_ids) > 1:
            raise NotImplementedError('Bulk cancelling not implemented')
        transfer_id = transfer_ids[0]

        job = requests.delete('%s/jobs/%s' % (self.external_host, transfer_id),
                              verify=self.verify,
                              cert=self.cert,
                              headers=self.headers,
                              timeout=timeout)

        if job and job.status_code == 200:
            record_counter('transfertool.fts3.%s.cancel.success' %
                           self.__extract_host(self.external_host))
            labels = {
                'state': 'success',
                'host': self.__extract_host(self.external_host)
            }
            CANCEL_COUNTER.labels(**labels).inc()
            return job.json()

        record_counter('transfertool.fts3.%s.cancel.failure' %
                       self.__extract_host(self.external_host))
        labels = {
            'state': 'failure',
            'host': self.__extract_host(self.external_host)
        }
        CANCEL_COUNTER.labels(**labels).inc()
        raise Exception('Could not cancel transfer: %s' % job.content)
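A matching sketch for cancellation; as with query, only single-job operation is implemented, and the job id is a placeholder:

try:
    tool.cancel(['job-id-1'])   # exactly one id: bulk cancel raises NotImplementedError
except Exception as error:
    print('cancellation failed: %s' % error)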
Example #58
0
    def register_block(self):
        """
        Register the dataset (if there is a replica at the pnn) and attach to container
        :dry: Dry run. Default false.
        """

        # FIXME: The logic here could use some improvement as we try to create a block even if it exists already

        try:
            get_did(scope=self.scope, name=self.block_name)
            self.block_exists = True
            monitor.record_counter('cms_sync.dataset_exists')
        except DataIdentifierNotFound:
            self.block_exists = False

        if self.is_at_pnn and self.dry_run:
            logging.info('Dry Run: Create dataset %s in scope %s.',
                         self.block_name, self.scope)
            self.block_exists = True
        elif self.is_at_pnn:
            logging.info('Create block %s in scope %s.', self.block_name,
                         self.scope)
            try:
                if not self.block_exists:
                    add_did(scope=self.scope,
                            name=self.block_name,
                            type='DATASET',
                            issuer=self.account,
                            lifetime=self.lifetime)
                    monitor.record_counter('cms_sync.dataset_created')
            except DataIdentifierAlreadyExists:
                logging.warning('Attempt to add %s:%s failed, already exists.',
                                self.scope, self.block_name)
                monitor.record_counter('cms_sync.dataset_collision')

            try:
                attach_dids(scope=self.scope,
                            name=self.container,
                            attachment={
                                'dids': [{
                                    'scope': self.scope,
                                    'name': self.block_name
                                }]
                            },
                            issuer=self.account)
            except DuplicateContent:
                logging.warning(
                    'Attempt to add %s:%s to %s failed, already exists.',
                    self.scope, self.block_name, self.container)
            except DataIdentifierNotFound:
                logging.error(
                    'Attempt to add %s:%s to %s failed. Container does not exist.',
                    self.scope, self.block_name, self.container)
                return False
            self.block_exists = True
        else:
            logging.warning('Block %s was not at PNN', self.block_name)

        return self.block_exists
Example #59
0
def submit_bulk_transfers(external_host, files, transfertool='fts3', job_params=None, timeout=None, user_transfer_job=False):
    """
    Submit transfer request to a transfertool.

    :param external_host:  External host name as string
    :param files:          List of Dictionary containing request file.
    :param transfertool:   Transfertool as a string.
    :param job_params:     Metadata key/value pairs for all files as a dictionary.
    :param timeout:        Timeout in seconds.
    :param user_transfer_job: Whether to submit via a user-specific transfertool if one is configured.
    :returns:              Transfertool external ID.
    """

    record_counter('core.request.submit_transfer')

    job_params = job_params or {}
    transfer_id = None

    if transfertool == 'fts3':
        start_time = time.time()
        job_files = []
        for file in files:
            job_file = {}
            for key in file:
                if key == 'sources':
                    # convert sources from (src_rse, url, src_rse_id, rank) to url
                    job_file[key] = []
                    for source in file[key]:
                        job_file[key].append(source[1])
                else:
                    job_file[key] = file[key]
            job_files.append(job_file)
        if not user_transfer_job:
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        elif USER_TRANSFERS == "cms":
            transfer_id = FTS3MyProxyTransfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        else:
            # if no valid USER TRANSFER cases --> go with std submission
            transfer_id = FTS3Transfertool(external_host=external_host).submit(files=job_files, job_params=job_params, timeout=timeout)
        record_timer('core.request.submit_transfers_fts3', (time.time() - start_time) * 1000 / len(files))
    return transfer_id
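A hedged call sketch; the files structure mirrors what the loop above expects (sources as (src_rse, url, src_rse_id, rank) tuples), and every value below is a placeholder:

files = [{'sources': [('MOCK-RSE', 'gsiftp://src.example.org/path/file', 1, 0)],
          'destinations': ['gsiftp://dst.example.org/path/file'],
          'filesize': 1024,
          'metadata': {'request_id': 'placeholder-id'}}]   # illustrative fields only
transfer_id = submit_bulk_transfers('https://fts3.example.org:8446', files,
                                    job_params={'verify_checksum': 'both'})   # placeholder params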
Example #60
0
    def on_message(self, headers, message):
        record_counter('daemons.tracer.kronos.reports')

        appversion = 'dq2'
        msg_id = headers['message-id']
        if 'appversion' in headers:
            appversion = headers['appversion']

        try:
            if appversion == 'dq2':
                self.__conn.ack(msg_id, self.__subscription_id)
                return
            else:
                report = jloads(message)
        except Exception:
            # message is corrupt, not much to do here
            # send count to graphite, send ack to broker and return
            record_counter('daemons.tracer.kronos.json_error')
            logging.error('(kronos_file) json error')
            self.__conn.ack(msg_id, self.__subscription_id)
            return

        self.__ids.append(msg_id)
        self.__reports.append(report)

        try:
            logging.debug('(kronos_file) message received: %s %s %s' % (str(report['eventType']), report['filename'], report['remoteSite']))
        except Exception:
            pass

        if len(self.__ids) >= self.__chunksize:
            self.__update_atime()
            for msg_id in self.__ids:
                self.__conn.ack(msg_id, self.__subscription_id)

            self.__reports = []
            self.__ids = []
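For context, a wiring sketch showing how a listener like this is typically attached with stomp.py; the destination, host, and credentials are placeholders:

import stomp

conn = stomp.Connection(host_and_ports=[('broker.example.org', 61013)])
conn.set_listener('kronos', listener)        # listener: an instance of the class above
conn.connect('user', 'password', wait=True)  # placeholder credentials
conn.subscribe(destination='/topic/rucio.tracer', id='rucio-tracer-kronos',
               ack='client-individual')      # client ack matches the manual ack() calls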