Example #1
def submitter(once=False,
              rses=None,
              mock=False,
              process=0,
              total_processes=1,
              total_threads=1,
              bulk=100,
              group_bulk=1,
              group_policy='rule',
              fts_source_strategy='auto',
              activities=None,
              sleep_time=600,
              max_sources=4,
              retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    logging.info('Transfer submitter starting - process (%i/%i) threads (%i)' %
                 (process, total_processes, total_threads))

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
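        # Expected option format: comma-separated "activity:hours" pairs,
        # e.g. "default:168,Express:24" (illustrative values).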
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" %
                  max_time_in_queue)

    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    hb = heartbeat.live(executable, hostname, pid, hb_thread)

    logging.info(
        'Transfer submitter started - process (%i/%i) threads (%i/%i) timeout (%s)'
        % (process, total_processes, hb['assign_thread'], hb['nr_threads'],
           timeout))

    threadPool = ThreadPool(total_threads)
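    # Next allowed execution time per activity; defaultdict(time.time)
    # makes any newly seen activity eligible to run immediately.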
    activity_next_exe_time = defaultdict(time.time)

    while not graceful_stop.is_set():

        try:
            hb = heartbeat.live(executable,
                                hostname,
                                pid,
                                hb_thread,
                                older_than=3600)

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info("CMS user transfer activity")
                    user_transfer = True

                logging.info(
                    "%s:%s Starting to get transfers for %s" %
                    (process, hb['assign_thread'], activity))
                ts = time.time()
                transfers = __get_transfers(process=process,
                                            total_processes=total_processes,
                                            thread=hb['assign_thread'],
                                            total_threads=hb['nr_threads'],
                                            failover_schemes=failover_scheme,
                                            limit=bulk,
                                            activity=activity,
                                            rses=rse_ids,
                                            schemes=scheme,
                                            mock=mock,
                                            max_sources=max_sources,
                                            bring_online=bring_online,
                                            retry_other_fts=retry_other_fts)
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.per_transfer',
                    (time.time() - ts) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter(
                    'daemons.conveyor.transfer_submitter.get_transfers',
                    len(transfers))
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.transfers',
                    len(transfers))
                logging.info(
                    "%s:%s Got %s transfers for %s" %
                    (process, hb['assign_thread'], len(transfers), activity))

                # group transfers
                logging.info("%s:%s Starting to group transfers for %s" %
                             (process, hb['assign_thread'], activity))
                ts = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk,
                                                   fts_source_strategy,
                                                   max_time_in_queue)
                record_timer(
                    'daemons.conveyor.transfer_submitter.bulk_group_transfer',
                    (time.time() - ts) * 1000 /
                    (len(transfers) if transfers else 1))

                logging.info("%s:%s Starting to submit transfers for %s" %
                             (process, hb['assign_thread'], activity))

                for external_host in grouped_jobs:
                    if not user_transfer:
                        for job in grouped_jobs[external_host]:
                            # submit transfers
                            job_requests = makeRequests(
                                submit_transfer,
                                args_list=[((), {
                                    'external_host': external_host,
                                    'job': job,
                                    'submitter': 'transfer_submitter',
                                    'process': process,
                                    'thread': hb['assign_thread'],
                                    'timeout': timeout
                                })])
                            for job_req in job_requests:
                                threadPool.putRequest(job_req)
                    else:
                        for _, jobs in grouped_jobs[external_host].items():
                            # submit transfers
                            for job in jobs:
                                job_requests = makeRequests(
                                    submit_transfer,
                                    args_list=[((), {
                                        'external_host': external_host,
                                        'job': job,
                                        'submitter': 'transfer_submitter',
                                        'process': process,
                                        'thread': hb['assign_thread'],
                                        'timeout': timeout,
                                        'user_transfer_job': user_transfer
                                    })])
                                for job_req in job_requests:
                                    threadPool.putRequest(job_req)
                threadPool.wait()

                if len(transfers) < group_bulk:
                    logging.info(
                        '%i:%i - only %s transfers for %s which is less than group bulk %s, sleep %s seconds'
                        % (process, hb['assign_thread'], len(transfers),
                           activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(
                '%s:%s %s' %
                (process, hb['assign_thread'], traceback.format_exc()))

        if once:
            break

    logging.info('%s:%s graceful stop requested' %
                 (process, hb['assign_thread']))

    threadPool.dismissWorkers(total_threads, do_join=True)
    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s:%s graceful stop done' % (process, hb['assign_thread']))
    return
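A minimal invocation sketch for this loop. The signal wiring is an assumption modeled on typical daemon entry points, not part of the example above; `graceful_stop` is the module-level `threading.Event` the loop polls, and the argument values are illustrative:

import signal

def stop(signum=None, frame=None):
    # Ask the loop above to finish its current pass and exit cleanly.
    graceful_stop.set()

signal.signal(signal.SIGTERM, stop)
submitter(once=False, total_threads=4, bulk=100, group_bulk=200, sleep_time=300)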
Example #2
def stager(once=False,
           rses=None,
           mock=False,
           bulk=100,
           group_bulk=1,
           group_policy='rule',
           source_strategy=None,
           activities=None,
           sleep_time=600,
           retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None

    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200
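        # 43200 seconds = 12 hours of allowed staging (bring-online) time.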

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}
    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s" %
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
    executable = ' '.join(sys.argv)
    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Stager starting with bring_online %s seconds' %
                 (bring_online))

    # Sleep to avoid running on the same partition if all the pollers restart at the same time.
    time.sleep(10)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info(prepend_str + 'Stager started')

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None

            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                logging.info(prepend_str +
                             'Starting to get stagein transfers for %s' %
                             (activity))
                start_time = time.time()
                transfers = __get_stagein_transfers(
                    total_workers=heart_beat['nr_threads'],
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    mock=mock,
                    schemes=scheme,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts)
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter('daemons.conveyor.stager.get_stagein_transfers',
                               len(transfers))
                record_timer(
                    'daemons.conveyor.stager.get_stagein_transfers.transfers',
                    len(transfers))
                logging.info(prepend_str + 'Got %s stagein transfers for %s' %
                             (len(transfers), activity))

                # group transfers
                logging.info(prepend_str +
                             'Starting to group transfers for %s' % (activity))
                start_time = time.time()
                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk, source_strategy,
                                                   max_time_in_queue)
                record_timer('daemons.conveyor.stager.bulk_group_transfer',
                             (time.time() - start_time) * 1000 /
                             (len(transfers) if transfers else 1))

                logging.info(prepend_str +
                             'Starting to submit transfers for %s' %
                             (activity))
                # submit transfers
                for external_host in grouped_jobs:
                    for job in grouped_jobs[external_host]:
                        # submit transfers
                        submit_transfer(external_host=external_host,
                                        job=job,
                                        submitter='transfer_submitter',
                                        logging_prepend_str=prepend_str)

                if len(transfers) < group_bulk:
                    logging.info(
                        prepend_str +
                        'Only %s transfers for %s which is less than group bulk %s, sleep %s seconds'
                        % (len(transfers), activity, group_bulk, sleep_time))
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical(prepend_str + '%s' % (traceback.format_exc()))

        if once:
            break

    logging.info(prepend_str + 'Graceful stop requested')

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info(prepend_str + 'Graceful stop done')
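The `max_time_in_queue` parsing appears verbatim in all three examples; here is a standalone sketch of the same logic (the helper name is hypothetical, the option format and the 168-hour default come from the code above):

def parse_max_time_in_queue(raw, default_hours=168):
    """Parse 'activity:hours' pairs, e.g. 'default:168,Express:24', into a dict."""
    result = {}
    for entry in raw.split(','):
        activity, hours = entry.split(':')
        result[activity.strip()] = int(hours.strip())
    result.setdefault('default', default_hours)
    return result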
Example #3
def submitter(once=False,
              rses=None,
              mock=False,
              bulk=100,
              group_bulk=1,
              group_policy='rule',
              source_strategy=None,
              activities=None,
              sleep_time=600,
              max_sources=4,
              retry_other_fts=False):
    """
    Main loop to submit a new transfer primitive to a transfertool.
    """

    try:
        scheme = config_get('conveyor', 'scheme')
    except NoOptionError:
        scheme = None
    try:
        failover_scheme = config_get('conveyor', 'failover_scheme')
    except NoOptionError:
        failover_scheme = None
    try:
        timeout = config_get('conveyor', 'submit_timeout')
        timeout = float(timeout)
    except NoOptionError:
        timeout = None

    try:
        bring_online = config_get('conveyor', 'bring_online')
    except NoOptionError:
        bring_online = 43200

    try:
        max_time_in_queue = {}
        timelife_conf = config_get('conveyor', 'max_time_in_queue')
        timelife_confs = timelife_conf.split(",")
        for conf in timelife_confs:
            act, timelife = conf.split(":")
            max_time_in_queue[act.strip()] = int(timelife.strip())
    except NoOptionError:
        max_time_in_queue = {}

    if 'default' not in max_time_in_queue:
        max_time_in_queue['default'] = 168
    logging.debug("Maximum time in queue for different activities: %s",
                  max_time_in_queue)

    activity_next_exe_time = defaultdict(time.time)
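    # Embedding the sorted activity list in the executable string keeps heartbeat
    # entries of submitters serving different activity sets separate (assumed intent).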
    executable = sys.argv[0]
    if activities:
        activities.sort()
        executable += ' --activities ' + str(activities)

    hostname = socket.getfqdn()
    pid = os.getpid()
    hb_thread = threading.current_thread()
    heartbeat.sanity_check(executable=executable, hostname=hostname)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info('%s Submitter starting with timeout %s', prepend_str, timeout)

    # Sleep to avoid running on the same partition if all the pollers restart at the same time.
    time.sleep(10)
    heart_beat = heartbeat.live(executable, hostname, pid, hb_thread)
    prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                         heart_beat['nr_threads'])
    logging.info('%s Transfer submitter started', prepend_str)

    while not graceful_stop.is_set():

        try:
            heart_beat = heartbeat.live(executable,
                                        hostname,
                                        pid,
                                        hb_thread,
                                        older_than=3600)
            prepend_str = 'Thread [%i/%i] : ' % (heart_beat['assign_thread'],
                                                 heart_beat['nr_threads'])

            if activities is None:
                activities = [None]
            if rses:
                rse_ids = [rse['id'] for rse in rses]
            else:
                rse_ids = None
            for activity in activities:
                if activity_next_exe_time[activity] > time.time():
                    graceful_stop.wait(1)
                    continue

                user_transfer = False

                if activity in USER_ACTIVITY and USER_TRANSFERS in ['cms']:
                    logging.info('%s CMS user transfer activity', prepend_str)
                    user_transfer = True

                logging.info('%s Starting to get transfers for %s',
                             prepend_str, activity)
                start_time = time.time()
                transfers = __get_transfers(
                    total_workers=heart_beat['nr_threads'] - 1,
                    worker_number=heart_beat['assign_thread'],
                    failover_schemes=failover_scheme,
                    limit=bulk,
                    activity=activity,
                    rses=rse_ids,
                    schemes=scheme,
                    mock=mock,
                    max_sources=max_sources,
                    bring_online=bring_online,
                    retry_other_fts=retry_other_fts)
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.per_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))
                record_counter(
                    'daemons.conveyor.transfer_submitter.get_transfers',
                    len(transfers))
                GET_TRANSFERS_COUNTER.inc(len(transfers))
                record_timer(
                    'daemons.conveyor.transfer_submitter.get_transfers.transfers',
                    len(transfers))
                logging.info('%s Got %s transfers for %s in %s seconds',
                             prepend_str, len(transfers), activity,
                             time.time() - start_time)

                # group transfers
                logging.info('%s Starting to group transfers for %s',
                             prepend_str, activity)
                start_time = time.time()

                grouped_jobs = bulk_group_transfer(transfers, group_policy,
                                                   group_bulk, source_strategy,
                                                   max_time_in_queue)
                record_timer(
                    'daemons.conveyor.transfer_submitter.bulk_group_transfer',
                    (time.time() - start_time) * 1000 /
                    (len(transfers) if transfers else 1))

                logging.info('%s Starting to submit transfers for %s',
                             prepend_str, activity)

                if TRANSFER_TOOL == 'fts3':
                    for external_host in grouped_jobs:
                        if not user_transfer:
                            for job in grouped_jobs[external_host]:
                                # submit transfers
                                submit_transfer(
                                    external_host=external_host,
                                    job=job,
                                    submitter='transfer_submitter',
                                    logging_prepend_str=prepend_str,
                                    timeout=timeout)
                        else:
                            for _, jobs in iteritems(
                                    grouped_jobs[external_host]):
                                # submit transfers
                                for job in jobs:
                                    submit_transfer(
                                        external_host=external_host,
                                        job=job,
                                        submitter='transfer_submitter',
                                        logging_prepend_str=prepend_str,
                                        timeout=timeout,
                                        user_transfer_job=user_transfer)
                elif TRANSFER_TOOL == 'globus':
                    if TRANSFER_TYPE == 'bulk':
                        # build bulk job file list per external host to send to submit_transfer
                        for external_host in grouped_jobs:
                            # pad the job with job_params; irrelevant for globus but needed for further rucio parsing
                            submitjob = {
                                'files': [],
                                'job_params': grouped_jobs[''][0].get('job_params')
                            }
                            for job in grouped_jobs[external_host]:
                                submitjob['files'].append(job['files'][0])
                            logging.debug('submitjob: %s', submitjob)
                            submit_transfer(external_host=external_host,
                                            job=submitjob,
                                            submitter='transfer_submitter',
                                            logging_prepend_str=prepend_str,
                                            timeout=timeout)
                    else:
                        # build single job files and individually send to submit_transfer
                        job_params = grouped_jobs[''][0].get('job_params') if grouped_jobs else None
                        for external_host in grouped_jobs:
                            for job in grouped_jobs[external_host]:
                                for transfer_file in job['files']:
                                    singlejob = {
                                        'files': [transfer_file],
                                        'job_params': job_params
                                    }
                                    logging.debug('singlejob: %s', singlejob)
                                    submit_transfer(
                                        external_host=external_host,
                                        job=singlejob,
                                        submitter='transfer_submitter',
                                        logging_prepend_str=prepend_str,
                                        timeout=timeout)
                else:
                    logging.error('%s Unknown transfer tool', prepend_str)

                if len(transfers) < group_bulk:
                    logging.info(
                        '%s Only %s transfers for %s which is less than group bulk %s, sleep %s seconds',
                        prepend_str, len(transfers), activity, group_bulk,
                        sleep_time)
                    if activity_next_exe_time[activity] < time.time():
                        activity_next_exe_time[activity] = time.time() + sleep_time
        except Exception:
            logging.critical('%s %s', prepend_str, traceback.format_exc())

        if once:
            break

    logging.info('%s Graceful stop requested', prepend_str)

    heartbeat.die(executable, hostname, pid, hb_thread)

    logging.info('%s Graceful stop done', prepend_str)
    return
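For a single pass, e.g. when testing, the `once` flag exits the loop after one iteration over the activities; a hedged one-shot call with illustrative values:

# Fetch at most 10 queued transfers per activity, group them into jobs of
# up to 10 transfers each, submit once, then return.
submitter(once=True, bulk=10, group_bulk=10, sleep_time=1)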