Example #1
    def close(self):
        """Clear the buffer, send a poison pill and the total number of docs"""
        if self.buffer:
            self.output_queue.put(self.buffer, timeout=time_remaining(self.starttime))
            self.buffer = []

        logging.warning("Closing listener, received %d documents total", self.count_in)
        # send back a poison pill
        self.output_queue.put(None, timeout=time_remaining(self.starttime))
        # send the number of total docs
        self.output_queue.put(self.count_in, timeout=time_remaining(self.starttime))
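
Every example on this page relies on a time_remaining helper that is not shown. A minimal sketch of what such a helper might look like, assuming a global budget of TIMEOUT_MINS minutes measured from starttime (the signature, including the positive flag, is inferred from the call sites; the implementation itself is an assumption):

import time

TIMEOUT_MINS = 11  # assumed global time budget, in minutes


def time_remaining(starttime, timeout=TIMEOUT_MINS * 60, positive=True):
    """Hypothetical sketch: seconds left in the budget started at starttime.

    With positive=True the result is clamped at zero, so it can be passed
    directly as a queue timeout; with positive=False it may go negative,
    which callers use to see how far over budget they are.
    """
    remaining = timeout - (time.time() - starttime)
    if positive:
        return max(0, remaining)
    return remaining
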
Example #2
    def run(self):
        since_last_report = 0
        while True:
            try:
                next_batch = self.input_queue.get(
                    timeout=time_remaining(self.starttime - 5))
            except queue.Empty:
                logging.warning(
                    "Closing listener before all schedds were processed")
                self.close()
                return

            if isinstance(next_batch, str):
                schedd_name = str(next_batch)
                try:
                    # We were already processing this sender,
                    # this is the signal that it's done sending.
                    self.tracker.remove(schedd_name)
                    self.n_processed += 1
                except ValueError:
                    # This is a new sender
                    self.tracker.append(schedd_name)

                if self.n_processed == self.n_expected:
                    # We finished processing all expected senders.
                    assert len(self.tracker) == 0
                    self.close()
                    return
                continue

            self.count_in += len(next_batch)
            since_last_report += len(next_batch)
            self.buffer.extend(next_batch)

            if since_last_report > self.report_every:
                logging.debug("Processed %d docs", self.count_in)
                since_last_report = 0

            # If buffer is full, send the docs and clear the buffer
            if len(self.buffer) >= self.bunch_size:
                self.output_queue.put(
                    self.buffer[:self.bunch_size],
                    timeout=time_remaining(self.starttime),
                )
                self.buffer = self.buffer[self.bunch_size:]
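
Examples #1 and #2 are the close() and run() methods of a ListenAndBunch listener whose constructor is not shown. Judging from the call sites in Examples #5 and #9 (it is built with the two queues, n_expected, starttime and bunch_size, then terminate()d and join()ed without an explicit start()), a plausible skeleton is sketched below; the attribute names are taken from the methods above, everything else is an assumption.

import multiprocessing


class ListenAndBunch(multiprocessing.Process):
    """Hypothetical skeleton: collect docs from input_queue and re-emit them
    on output_queue in bunches of bunch_size (see run() and close() above)."""

    def __init__(self, input_queue, output_queue, n_expected,
                 starttime, bunch_size, report_every=50000):
        super().__init__()
        self.input_queue = input_queue
        self.output_queue = output_queue
        self.n_expected = n_expected    # number of sender processes to wait for
        self.starttime = starttime
        self.bunch_size = bunch_size
        self.report_every = report_every
        self.buffer = []                # docs waiting to be bunched
        self.tracker = []               # schedd names currently sending
        self.n_processed = 0            # senders that have finished
        self.count_in = 0               # total docs received
        self.start()                    # run() executes in a child process
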
Example #3
def process_schedd(starttime, last_completion, checkpoint_queue, schedd_ad,
                   args):
    """
    Given a schedd, process its entire set of history since last checkpoint.
    """
    my_start = time.time()
    if time_remaining(starttime) < 0:
        message = ("No time remaining to process %s history; exiting." %
                   schedd_ad['Name'])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms history timeout warning", message)
        return last_completion

    schedd = htcondor.Schedd(schedd_ad)
    history_query = classad.ExprTree("EnteredCurrentStatus >= %d" %
                                     last_completion)
    logging.info("Querying %s for history: %s.  "
                 "%.1f minutes of ads", schedd_ad["Name"], history_query,
                 (time.time() - last_completion) / 60.)
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only:
        if args.feed_es:
            es = htcondor_es.es.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = schedd.history(history_query, [], 10000)
        else:
            history_iter = []

        for job_ad in history_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad, return_dict=True)
            except Exception as e:
                message = (
                    "Failure when converting document on %s history: %s" %
                    (schedd_ad["Name"], str(e)))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms history document conversion error",
                        message)
                    sent_warnings = True

            if not dict_ad:
                continue

            idx = htcondor_es.es.get_index(job_ad["QDate"],
                                           template=args.es_index_template,
                                           update_es=(args.feed_es
                                                      and not args.read_only))
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    if args.feed_es:
                        data_for_es = [(id_, json.dumps(dict_ad))
                                       for id_, dict_ad in ad_list]
                        htcondor_es.es.post_ads(es=es.handle,
                                                idx=idx,
                                                ads=data_for_es)
                    if args.feed_amq:
                        data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad)) for \
                                        id_, dict_ad in ad_list]
                        htcondor_es.amq.post_ads(data_for_amq)
                logging.debug("...posting %d ads from %s (process_schedd)",
                              len(ad_list), schedd_ad["Name"])
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            # Find the most recent job and use that date as the new
            # last_completion date
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion and job_completion > last_completion:
                last_completion = job_completion

            if time_remaining(starttime) < 0:
                message = ("History crawler on %s has been running for "
                           "more than %d minutes; exiting." %
                           (schedd_ad["Name"], TIMEOUT_MINS))
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)
                timed_out = True
                break

    except RuntimeError:
        logging.error("Failed to query schedd for job history: %s",
                      schedd_ad["Name"])

    except Exception as exn:
        message = ("Failure when processing schedd history query on %s: %s" %
                   (schedd_ad["Name"], str(exn)))
        logging.exception(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd history query error", message)

    # Post the remaining ads
    for idx, ad_list in buffered_ads.items():
        if ad_list:
            logging.debug(
                "...posting remaining %d ads from %s "
                "(process_schedd)", len(ad_list), schedd_ad["Name"])
            if not args.read_only:
                if args.feed_es:
                    htcondor_es.es.post_ads(es=es.handle,
                                            idx=idx,
                                            ads=[(id_, json.dumps(dict_ad))
                                                 for id_, dict_ad in ad_list])
                if args.feed_amq:
                    data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad)) for \
                                    id_, dict_ad in ad_list]
                    htcondor_es.amq.post_ads(data_for_amq)

    total_time = (time.time() - my_start) / 60.
    total_upload /= 60.
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.warning(
        "Schedd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min", schedd_ad["Name"], count,
        last_formatted, total_time - total_upload, total_upload)

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((schedd_ad["Name"], last_completion))

    return last_completion
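
The crawlers report problems through send_email_alert(recipients, subject, message), which is not part of these examples. A minimal sketch, assuming args.email_alerts is a list of addresses and that the helper simply sends a plain-text mail (the SMTP host and sender address below are placeholders):

import logging
import smtplib
from email.mime.text import MIMEText


def send_email_alert(recipients, subject, message):
    """Hypothetical sketch: mail `message` to `recipients`; no-op if empty."""
    if not recipients:
        return
    try:
        msg = MIMEText(message)
        msg["Subject"] = subject
        msg["From"] = "spider_cms@localhost"        # placeholder sender
        msg["To"] = ", ".join(recipients)
        with smtplib.SMTP("localhost") as server:   # placeholder SMTP host
            server.sendmail(msg["From"], recipients, msg.as_string())
    except Exception:
        logging.exception("Failed to send alert email")
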
Example #4
    def _chkp_updater():
        while True:
            job = checkpoint_queue.get()
            if job is None:  # Swallow poison pill
                break
            update_checkpoint(*job)

    chkp_updater = multiprocessing.Process(target=_chkp_updater)
    chkp_updater.start()

    # Check whether any of the processes timed out; if so, do not advance
    # its last-completion checkpoint
    timed_out = False
    for name, future in futures:
        if time_remaining(starttime) > -10:
            try:
                future.get(time_remaining(starttime) + 10)
            except multiprocessing.TimeoutError:
                # This implies that the checkpoint hasn't been updated
                message = "Schedd %s history timed out; ignoring progress." % name
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)

        else:
            timed_out = True
            break
    if timed_out:
        pool.terminate()
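
The checkpoint updater above consumes (schedd_name, last_completion) tuples and hands them to update_checkpoint, which is not shown. A plausible sketch, modelled on the checkpoint-file handling in Example #12 (read checkpoint.json, keep the newer timestamp, write atomically via a temp file and rename):

import json
import os
import tempfile


def update_checkpoint(name, timestamp):
    """Hypothetical sketch: merge one (schedd name, last completion) pair
    into checkpoint.json, never moving a checkpoint backwards."""
    try:
        with open("checkpoint.json") as fd:
            checkpoint = json.load(fd)
    except (IOError, ValueError):
        checkpoint = {}

    if checkpoint.get(name, 0) < timestamp:
        checkpoint[name] = timestamp

    # Write atomically: dump to a temp file, then rename over the original
    fd, tmpname = tempfile.mkstemp(dir=".", prefix="checkpoint.json.new")
    with os.fdopen(fd, "w") as out:
        json.dump(checkpoint, out)
    os.rename(tmpname, "checkpoint.json")
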
Example #5
def process_queues(schedd_ads, starttime, pool, args, metadata=None):
    """
    Process all the jobs in all the schedds given.
    """
    my_start = time.time()
    if time_remaining(starttime) < 10:
        logging.warning("No time remaining to process queues")
        return

    metadata = metadata or {}
    metadata["spider_source"] = "condor_queue"

    mp_manager = multiprocessing.Manager()
    input_queue = mp_manager.Queue()
    output_queue = mp_manager.Queue()
    listener = ListenAndBunch(
        input_queue=input_queue,
        output_queue=output_queue,
        n_expected=len(schedd_ads),
        starttime=starttime,
        bunch_size=args.amq_bunch_size,
    )
    futures = []

    upload_pool = multiprocessing.Pool(processes=args.upload_pool_size)

    for schedd_ad in schedd_ads:
        future = pool.apply_async(query_schedd_queue,
                                  args=(starttime, schedd_ad, input_queue,
                                        args))
        futures.append((schedd_ad["Name"], future))

    def _callback_amq(result):
        sent, received, elapsed = result
        logging.info("Uploaded %d/%d docs to StompAMQ in %d seconds", sent,
                     received, elapsed)

    total_processed = 0
    while True:
        if args.dry_run or len(schedd_ads) == 0:
            break

        if time_remaining(starttime) < 5:
            logging.warning(
                "Listener did not shut down properly; terminating.")
            listener.terminate()
            break

        bunch = output_queue.get(timeout=time_remaining(starttime))
        if bunch is None:  # swallow the poison pill
            total_processed = int(
                output_queue.get(timeout=time_remaining(starttime)))
            break

        if args.feed_es_for_queues and not args.read_only:
            ## Note that these bunches are sized according to --amq_bunch_size
            ## FIXME: Why are we determining the index from one ad?
            idx = htcondor_es.es.get_index(
                bunch[0][1].get("QDate", int(time.time())),
                template=args.es_index_template,
                update_es=(args.feed_es and not args.read_only),
            )

            future = upload_pool.apply_async(htcondor_es.es.post_ads_nohandle,
                                             args=(idx, bunch, args, metadata))
            futures.append(("UPLOADER_ES", future))

        if args.feed_amq and not args.read_only:
            amq_bunch = [(id_, convert_dates_to_millisecs(dict_ad))
                         for id_, dict_ad in bunch]
            future = upload_pool.apply_async(
                htcondor_es.amq.post_ads,
                args=(amq_bunch, metadata),
                callback=_callback_amq,
            )
            futures.append(("UPLOADER_AMQ", future))

        logging.info("Starting new uploader, %d items in queue" %
                     output_queue.qsize())

    listener.join()

    timed_out = False
    total_sent = 0
    total_upload_time = 0
    total_queried = 0
    for name, future in futures:
        if time_remaining(starttime, positive=False) > -20:
            try:
                count = future.get(time_remaining(starttime) + 10)
                if name == "UPLOADER_AMQ":
                    total_sent += count[0]
                    total_upload_time += count[2]
                elif name == "UPLOADER_ES":
                    total_sent += count
                else:
                    try:
                        total_queried += count
                    except TypeError:
                        pass
            except multiprocessing.TimeoutError:
                message = "Schedd %s queue timed out; ignoring progress." % name
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms queue timeout warning", message)
        else:
            timed_out = True
            break

    if timed_out:
        logging.error(
            "Timed out when retrieving uploaders. Upload count incomplete.")
        pool.terminate()
        upload_pool.terminate()

    if total_queried != total_processed:
        logging.warning(
            "Number of queried docs not equal to number of processed docs.")

    logging.warning(
        "Processing time for queues: %.2f mins, %d/%d docs sent in %.2f min "
        "of total upload time",
        (time.time() - my_start) / 60.0,
        total_sent,
        total_queried,
        total_upload_time / 60.0,
    )

    upload_pool.close()
    upload_pool.join()
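
Ads destined for AMQ are first passed through convert_dates_to_millisecs. A minimal sketch of what that conversion might do, assuming the monitoring backend expects epoch milliseconds; both the field list and the copy-then-convert behaviour are assumptions:

# Hypothetical sketch; the actual set of date fields is an assumption.
_DATE_FIELDS = ("QDate", "EnteredCurrentStatus", "RecordTime", "CompletionDate")


def convert_dates_to_millisecs(record):
    """Return a copy of the ad with second-resolution timestamps
    converted to millisecond resolution."""
    converted = dict(record)
    for key in _DATE_FIELDS:
        value = converted.get(key)
        if isinstance(value, (int, float)) and value > 0:
            converted[key] = int(value * 1000)
    return converted
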
Example #6
def query_schedd_queue(starttime, schedd_ad, queue, args):
    my_start = time.time()
    pool_name = schedd_ad.get("CMS_Pool", "Unknown")
    logging.info("Querying %s queue for jobs.", schedd_ad["Name"])
    if time_remaining(starttime) < 10:
        message = ("No time remaining to run queue crawler on %s; "
                   "exiting." % schedd_ad["Name"])
        logging.error(message)
        send_email_alert(args.email_alerts, "spider_cms queue timeout warning",
                         message)
        return

    count_since_last_report = 0
    count = 0
    cpu_usage = resource.getrusage(resource.RUSAGE_SELF).ru_utime
    queue.put(schedd_ad["Name"], timeout=time_remaining(starttime))

    schedd = htcondor.Schedd(schedd_ad)
    sent_warnings = False
    batch = []
    # Query for a snapshot of the jobs running/idle/held,
    # but only the completed that had changed in the last period of time.
    _completed_since = starttime - (TIMEOUT_MINS + 1) * 60
    query = ("""
         (JobStatus < 3 || JobStatus > 4 
         || EnteredCurrentStatus >= %(completed_since)d
         || CRAB_PostJobLastUpdate >= %(completed_since)d
         ) && (CMS_Type != "DONOTMONIT")
         """ % {
        "completed_since": _completed_since
    })
    try:
        query_iter = schedd.xquery(
            requirements=query) if not args.dry_run else []
        for job_ad in query_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(
                    job_ad,
                    return_dict=True,
                    reduce_data=not args.keep_full_queue_data,
                    pool_name=pool_name)
            except Exception as e:
                message = "Failure when converting document on %s queue: %s" % (
                    schedd_ad["Name"],
                    str(e),
                )
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms queue document conversion error",
                        message,
                    )
                    sent_warnings = True

            if not dict_ad:
                continue

            batch.append((unique_doc_id(dict_ad), dict_ad))
            count += 1
            count_since_last_report += 1

            if not args.dry_run and len(batch) == args.query_queue_batch_size:
                if time_remaining(starttime) < 10:
                    message = ("Queue crawler on %s has been running for "
                               "more than %d minutes; exiting" %
                               (schedd_ad["Name"], TIMEOUT_MINS))
                    logging.error(message)
                    send_email_alert(args.email_alerts,
                                     "spider_cms queue timeout warning",
                                     message)
                    break
                queue.put(batch, timeout=time_remaining(starttime))
                batch = []
                if count_since_last_report >= 1000:
                    cpu_usage_now = resource.getrusage(
                        resource.RUSAGE_SELF).ru_utime
                    cpu_usage = cpu_usage_now - cpu_usage
                    processing_rate = count_since_last_report / cpu_usage
                    cpu_usage = cpu_usage_now
                    logging.info(
                        "Processor for %s has processed %d jobs "
                        "(%.1f jobs per CPU-second)",
                        schedd_ad["Name"],
                        count,
                        processing_rate,
                    )
                    count_since_last_report = 0

            if args.max_documents_to_process and count > args.max_documents_to_process:
                logging.warning(
                    "Aborting after %d documents (--max_documents_to_process option)"
                    % args.max_documents_to_process)
                break

    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))
    except Exception as e:
        message = "Failure when processing schedd queue query on %s: %s" % (
            schedd_ad["Name"],
            str(e),
        )
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd queue query error", message)
        traceback.print_exc()

    if batch:  # send remaining docs
        queue.put(batch, timeout=time_remaining(starttime))
        batch = []

    queue.put(schedd_ad["Name"], timeout=time_remaining(starttime))
    total_time = (time.time() - my_start) / 60.0
    logging.warning(
        "Schedd %-25s queue: response count: %5d; "
        "query time %.2f min; ",
        schedd_ad["Name"],
        count,
        total_time,
    )

    return count
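
The newer examples key each document with unique_doc_id(dict_ad), whereas the older Example #10 uses GlobalJobId directly. A plausible sketch, assuming the id combines GlobalJobId with a record timestamp so that repeated dumps of the same job stay distinct (the exact fields are an assumption):

def unique_doc_id(dict_ad):
    """Hypothetical sketch: stable per-record document id for one converted ad."""
    return "%s#%d" % (dict_ad["GlobalJobId"], dict_ad.get("RecordTime", 0))
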
Example #7
def process_schedd(starttime,
                   last_completion,
                   checkpoint_queue,
                   schedd_ad,
                   args,
                   metadata=None):
    """
    Given a schedd, process its entire set of history since last checkpoint.
    """
    my_start = time.time()
    pool_name = schedd_ad.get("CMS_Pool", "Unknown")
    if time_remaining(starttime) < 0:
        message = ("No time remaining to process %s history; exiting." %
                   schedd_ad["Name"])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms history timeout warning", message)
        return last_completion

    metadata = metadata or {}
    schedd = htcondor.Schedd(schedd_ad)
    _q = """
        ( EnteredCurrentStatus >= %(last_completion)d 
        || CRAB_PostJobLastUpdate >= %(last_completion)d )
        && (CMS_Type != "DONOTMONIT")
        """
    history_query = classad.ExprTree(_q % {"last_completion": last_completion})
    logging.info(
        "Querying %s for history: %s.  "
        "%.1f minutes of ads",
        schedd_ad["Name"],
        history_query,
        (time.time() - last_completion) / 60.0,
    )
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only:
        if args.feed_es:
            es = htcondor_es.es.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = schedd.history(history_query, [], 10000)
        else:
            history_iter = []

        for job_ad in history_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad,
                                          return_dict=True,
                                          pool_name=pool_name)
            except Exception as e:
                message = "Failure when converting document on %s history: %s" % (
                    schedd_ad["Name"],
                    str(e),
                )
                exc = traceback.format_exc()
                message += "\n{}".format(exc)
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms history document conversion error",
                        message,
                    )
                    sent_warnings = True

            if not dict_ad:
                continue

            idx = htcondor_es.es.get_index(
                job_ad["QDate"],
                template=args.es_index_template,
                update_es=(args.feed_es and not args.read_only),
            )
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    if args.feed_es:
                        htcondor_es.es.post_ads(es.handle,
                                                idx,
                                                ad_list,
                                                metadata=metadata)
                    if args.feed_amq:
                        data_for_amq = [(id_,
                                         convert_dates_to_millisecs(dict_ad))
                                        for id_, dict_ad in ad_list]
                        htcondor_es.amq.post_ads(data_for_amq,
                                                 metadata=metadata)

                logging.debug(
                    "...posting %d ads from %s (process_schedd)",
                    len(ad_list),
                    schedd_ad["Name"],
                )
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            # Find the most recent job and use that date as the new
            # last_completion date
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion and job_completion > last_completion:
                last_completion = job_completion

            if time_remaining(starttime) < 0:
                message = ("History crawler on %s has been running for "
                           "more than %d minutes; exiting." %
                           (schedd_ad["Name"], TIMEOUT_MINS))
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)
                timed_out = True
                break

            if args.max_documents_to_process and count > args.max_documents_to_process:
                logging.warning(
                    "Aborting after %d documents (--max_documents_to_process option)"
                    % args.max_documents_to_process)
                break

    except RuntimeError:
        message = "Failed to query schedd for job history: %s" % schedd_ad[
            "Name"]
        exc = traceback.format_exc()
        message += "\n{}".format(exc)
        logging.error(message)

    except Exception as exn:
        message = "Failure when processing schedd history query on %s: %s" % (
            schedd_ad["Name"],
            str(exn),
        )
        exc = traceback.format_exc()
        message += "\n{}".format(exc)
        logging.exception(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd history query error", message)

    # Post the remaining ads
    for idx, ad_list in list(buffered_ads.items()):
        if ad_list:
            logging.debug(
                "...posting remaining %d ads from %s "
                "(process_schedd)",
                len(ad_list),
                schedd_ad["Name"],
            )
            if not args.read_only:
                if args.feed_es:
                    htcondor_es.es.post_ads(es.handle,
                                            idx,
                                            ad_list,
                                            metadata=metadata)
                if args.feed_amq:
                    data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                    for id_, dict_ad in ad_list]
                    htcondor_es.amq.post_ads(data_for_amq, metadata=metadata)

    total_time = (time.time() - my_start) / 60.0
    total_upload /= 60.0
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.warning(
        "Schedd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min",
        schedd_ad["Name"],
        count,
        last_formatted,
        total_time - total_upload,
        total_upload,
    )

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((schedd_ad["Name"], last_completion))

    return last_completion
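
Both history crawlers pick the target Elasticsearch index with htcondor_es.es.get_index(job_ad["QDate"], template=..., update_es=...). A rough sketch of the idea, assuming the template is a strftime-style pattern that buckets documents by the month of their QDate and that update_es only controls whether the index is created up front; none of this is the project's actual naming scheme:

import time


def get_index(timestamp, template="cms-%Y-%m", update_es=False):
    """Hypothetical sketch: map a QDate timestamp to an index name."""
    idx = time.strftime(template, time.localtime(timestamp))
    if update_es:
        # The real code would ensure the index and its mappings exist
        # before any documents are posted to it.
        pass
    return idx
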
Example #8
def process_histories(schedd_ads, starttime, pool, args, metadata=None):
    """
    Process history files for each schedd listed in a given
    multiprocessing pool
    """
    try:
        checkpoint = json.load(open("checkpoint.json"))
    except (IOError, ValueError):
        checkpoint = {}

    futures = []
    metadata = metadata or {}
    metadata["spider_source"] = "condor_history"

    manager = multiprocessing.Manager()
    checkpoint_queue = manager.Queue()

    for schedd_ad in schedd_ads:
        name = schedd_ad["Name"]

        # Check for last completion time
        # If there was no previous completion, get last 12 h
        last_completion = checkpoint.get(name, time.time() - 12 * 3600)

        # For CRAB, only ever get a maximum of 12 h
        if name.startswith(
                "crab") and last_completion < time.time() - 12 * 3600:
            last_completion = time.time() - 12 * 3600

        future = pool.apply_async(
            process_schedd,
            (starttime, last_completion, checkpoint_queue, schedd_ad, args,
             metadata),
        )
        futures.append((name, future))

    def _chkp_updater():
        while True:
            try:
                job = checkpoint_queue.get()
                if job is None:  # Swallow poison pill
                    break
            except EOFError as error:
                logging.warning(
                    "EOFError - Nothing to consume left in the queue %s",
                    error)
                break
            update_checkpoint(*job)

    chkp_updater = multiprocessing.Process(target=_chkp_updater)
    chkp_updater.start()

    # Check whether any of the processes timed out; if so, do not advance
    # its last-completion checkpoint
    timed_out = False
    for name, future in futures:
        if time_remaining(starttime) > -10:
            try:
                future.get(time_remaining(starttime) + 10)
            except multiprocessing.TimeoutError:
                # This implies that the checkpoint hasn't been updated
                message = "Schedd %s history timed out; ignoring progress." % name
                exc = traceback.format_exc()
                message += "\n{}".format(exc)
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)
            except elasticsearch.exceptions.TransportError:
                message = (
                    "Transport error while sending history data of %s; ignoring progress."
                    % name)
                exc = traceback.format_exc()
                message += "\n{}".format(exc)
                logging.error(message)
                send_email_alert(
                    args.email_alerts,
                    "spider_cms history transport error warning",
                    message,
                )
        else:
            timed_out = True
            break
    if timed_out:
        pool.terminate()

    checkpoint_queue.put(None)  # Send a poison pill
    chkp_updater.join()

    logging.warning("Processing time for history: %.2f mins",
                    ((time.time() - starttime) / 60.0))
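
Examples #5 and #8 both expect an already-created worker pool, the list of schedd ads and a shared starttime. A minimal driver wiring them together might look like the sketch below; get_schedds, args.query_pool_size and args.skip_history are placeholders rather than names taken from the project:

import time
import multiprocessing


def main(args):
    """Hypothetical driver for the history and queue crawlers above."""
    starttime = time.time()            # the global time budget starts here
    schedd_ads = get_schedds(args)     # placeholder: collect the schedd ClassAds
    pool = multiprocessing.Pool(processes=args.query_pool_size)

    metadata = {}                      # spider_source is set by the process_* functions
    if not args.skip_history:
        process_histories(schedd_ads, starttime, pool, args, metadata=metadata)
    process_queues(schedd_ads, starttime, pool, args, metadata=metadata)

    pool.close()
    pool.join()
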
Example #9
def process_queues(schedd_ads, starttime, pool, args):
    """
    Process all the jobs in all the schedds given.
    """
    my_start = time.time()
    if time_remaining(starttime) < 0:
        logging.warning("No time remaining to process queues")
        return

    mp_manager = multiprocessing.Manager()
    input_queue = mp_manager.Queue(maxsize=10)
    output_queue = mp_manager.Queue(maxsize=2)
    listener = ListenAndBunch(input_queue=input_queue,
                              output_queue=output_queue,
                              n_expected=len(schedd_ads),
                              starttime=starttime,
                              bunch_size=5000)
    futures = []

    upload_pool = multiprocessing.Pool(processes=3)

    for schedd_ad in schedd_ads:
        future = pool.apply_async(process_schedd_queue,
                                  args=(starttime, schedd_ad, input_queue,
                                        args))
        futures.append((schedd_ad['Name'], future))

    def _callback_amq(result):
        sent, received, elapsed = result
        logging.info("Uploaded %d/%d docs to StompAMQ in %d seconds", sent,
                     received, elapsed)

    total_processed = 0
    while True:
        if args.dry_run or len(schedd_ads) == 0:
            break

        if time_remaining(starttime) < -10:
            logging.warning(
                "Listener did not shut down properly; terminating.")
            listener.terminate()
            break

        bunch = output_queue.get(timeout=time_remaining(starttime))
        if bunch is None:  # swallow the poison pill
            total_processed = int(
                output_queue.get(timeout=time_remaining(starttime)))
            break

        if args.feed_amq and not args.read_only:
            amq_bunch = [(id_, convert_dates_to_millisecs(dict_ad))
                         for id_, dict_ad in bunch]
            future = upload_pool.apply_async(htcondor_es.amq.post_ads,
                                             args=(amq_bunch, ),
                                             callback=_callback_amq)
            futures.append(("UPLOADER_AMQ", future))

        if args.feed_es_for_queues and not args.read_only:
            es_bunch = [(id_, json.dumps(dict_ad)) for id_, dict_ad in bunch]
            ## FIXME: Why are we determining the index from one ad?
            idx = htcondor_es.es.get_index(bunch[0][1].get(
                "QDate", int(time.time())),
                                           template=args.es_index_template,
                                           update_es=(args.feed_es
                                                      and not args.read_only))

            future = upload_pool.apply_async(htcondor_es.es.post_ads_nohandle,
                                             args=(idx, es_bunch, args))
            futures.append(("UPLOADER_ES", future))

        # Throttle: keep at most max_in_progress uploads in flight at once
        max_in_progress = 3
        while time_remaining(starttime) > 0:
            in_progress = sum(1 for _, fut in futures if not fut.ready())
            if in_progress <= max_in_progress:
                break
            # Block on the first unfinished upload before re-checking
            for _, fut in futures:
                if not fut.ready():
                    fut.wait(time_remaining(starttime) + 10)
                    break

    listener.join()

    timed_out = False
    total_sent = 0
    total_upload_time = 0
    total_queried = 0
    for name, future in futures:
        if time_remaining(starttime) > -10:
            try:
                count = future.get(time_remaining(starttime) + 10)
                if name == "UPLOADER_AMQ":
                    total_sent += count[0]
                    total_upload_time += count[2]
                elif name == "UPLOADER_ES":
                    total_sent += count
                else:
                    total_queried += count
            except multiprocessing.TimeoutError:
                message = "Schedd %s queue timed out; ignoring progress." % name
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms queue timeout warning", message)
        else:
            timed_out = True
            break

    if timed_out:
        pool.terminate()
        upload_pool.terminate()

    if total_queried != total_processed:
        logging.warning(
            "Number of queried docs not equal to number of processed docs.")

    logging.warning(
        "Processing time for queues: %.2f mins, %d/%d docs sent in %.2f min "
        "of total upload time", (time.time() - my_start) / 60., total_sent,
        total_queried, total_upload_time / 60.)

    upload_pool.close()
    upload_pool.join()
Example #10
def process_schedd_queue(starttime, schedd_ad, queue, args):
    my_start = time.time()
    logging.info("Querying %s queue for jobs.", schedd_ad["Name"])
    if time_remaining(starttime) < 0:
        message = ("No time remaining to run queue crawler on %s; "
                   "exiting." % schedd_ad['Name'])
        logging.error(message)
        send_email_alert(args.email_alerts, "spider_cms queue timeout warning",
                         message)
        return

    count_since_last_report = 0
    count = 0
    cpu_usage = resource.getrusage(resource.RUSAGE_SELF).ru_utime
    queue.put(schedd_ad['Name'], timeout=time_remaining(starttime))

    schedd = htcondor.Schedd(schedd_ad)
    sent_warnings = False
    batch = []
    try:
        query_iter = schedd.xquery() if not args.dry_run else []
        for job_ad in query_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad,
                                          return_dict=True,
                                          reduce_data=args.reduce_running_data)
            except Exception as e:
                message = ("Failure when converting document on %s queue: %s" %
                           (schedd_ad["Name"], str(e)))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms queue document conversion error", message)
                    sent_warnings = True

            if not dict_ad:
                continue

            batch.append((job_ad["GlobalJobId"], dict_ad))
            count += 1
            count_since_last_report += 1

            if not args.dry_run and len(batch) == args.query_queue_batch_size:
                if time_remaining(starttime) < 0:
                    message = ("Queue crawler on %s has been running for "
                               "more than %d minutes; exiting" %
                               (schedd_ad['Name'], TIMEOUT_MINS))
                    logging.error(message)
                    send_email_alert(args.email_alerts,
                                     "spider_cms queue timeout warning",
                                     message)
                    break
                queue.put(batch, timeout=time_remaining(starttime))
                batch = []
                if count_since_last_report >= 1000:
                    cpu_usage_now = resource.getrusage(
                        resource.RUSAGE_SELF).ru_utime
                    cpu_usage = cpu_usage_now - cpu_usage
                    processing_rate = count_since_last_report / cpu_usage
                    cpu_usage = cpu_usage_now
                    logging.info(
                        "Processor for %s has processed %d jobs "
                        "(%.1f jobs per CPU-second)", schedd_ad['Name'], count,
                        processing_rate)
                    count_since_last_report = 0

        if batch:  # send remaining docs
            queue.put(batch, timeout=time_remaining(starttime))
            batch = []

    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))
Example #11
        if batch:  # send remaining docs
            queue.put(batch, timeout=time_remaining(starttime))
            batch = []

    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))
    except Exception as e:
        message = ("Failure when processing schedd queue query on %s: %s" %
                   (schedd_ad["Name"], str(e)))
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd queue query error", message)
        traceback.print_exc()

    queue.put(schedd_ad['Name'], timeout=time_remaining(starttime))
    total_time = (time.time() - my_start) / 60.
    logging.warning(
        "Schedd %-25s queue: response count: %5d; "
        "query time %.2f min; ", schedd_ad["Name"], count, total_time)

    return count


def process_queues(schedd_ads, starttime, pool, args):
    """
    Process all the jobs in all the schedds given.
    """
    my_start = time.time()
    if time_remaining(starttime) < 0:
        logging.warning("No time remaining to process queues")
Example #12
def process_histories(schedd_ads, starttime, pool, args):
    """
    Process history files for each schedd listed in a given
    multiprocessing pool
    """
    try:
        checkpoint = json.load(open("checkpoint.json"))
    except (IOError, ValueError):
        checkpoint = {}

    futures = []

    for schedd_ad in schedd_ads:
        name = schedd_ad["Name"]

        # Check for last completion time
        # If there was no previous completion, get last 12 h
        last_completion = checkpoint.get(name, time.time() - 12 * 3600)

        # For CRAB, only ever get a maximum of 12 h
        if name.startswith(
                "crab") and last_completion < time.time() - 12 * 3600:
            last_completion = time.time() - 12 * 3600

        future = pool.apply_async(
            process_schedd, (starttime, last_completion, schedd_ad, args))
        futures.append((name, future))

    # Check whether any of the processes timed out; if so, do not advance
    # its last-completion checkpoint
    timed_out = False
    for name, future in futures:
        if time_remaining(starttime) > -10:
            try:
                last_completion = future.get(time_remaining(starttime) + 10)
                if name:
                    checkpoint[name] = last_completion

            except multiprocessing.TimeoutError:
                message = "Schedd %s history timed out; ignoring progress." % name
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)

        else:
            timed_out = True
            break
    if timed_out:
        pool.terminate()

    # Update the last completion checkpoint file
    try:
        checkpoint_new = json.load(open("checkpoint.json"))
    except (IOError, ValueError):
        checkpoint_new = {}

    for key, val in checkpoint.items():
        if (key not in checkpoint_new) or (val > checkpoint_new[key]):
            checkpoint_new[key] = val

    fd, tmpname = tempfile.mkstemp(dir=".", prefix="checkpoint.json.new")
    fd = os.fdopen(fd, "w")
    json.dump(checkpoint_new, fd)
    fd.close()
    os.rename(tmpname, "checkpoint.json")

    logging.warning("Processing time for history: %.2f mins",
                    ((time.time() - starttime) / 60.))