def query_schedd_queue(starttime, schedd_ad, queue, args):
    """
    Query a schedd for a snapshot of its job queue and stream the converted
    ads into `queue` in batches of --query_queue_batch_size.
    """
    my_start = time.time()
    pool_name = schedd_ad.get("CMS_Pool", "Unknown")
    logging.info("Querying %s queue for jobs.", schedd_ad["Name"])
    if time_remaining(starttime) < 10:
        message = ("No time remaining to run queue crawler on %s; "
                   "exiting." % schedd_ad["Name"])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms queue timeout warning", message)
        return

    count_since_last_report = 0
    count = 0
    cpu_usage = resource.getrusage(resource.RUSAGE_SELF).ru_utime
    queue.put(schedd_ad["Name"], timeout=time_remaining(starttime))

    schedd = htcondor.Schedd(schedd_ad)
    sent_warnings = False
    batch = []
    # Query for a snapshot of the running/idle/held jobs, but only those
    # completed jobs that changed in the last period of time.
    _completed_since = starttime - (TIMEOUT_MINS + 1) * 60
    query = (
        """
        (JobStatus < 3 || JobStatus > 4
         || EnteredCurrentStatus >= %(completed_since)d
         || CRAB_PostJobLastUpdate >= %(completed_since)d)
        && (CMS_Type != "DONOTMONIT")
        """
        % {"completed_since": _completed_since}
    )
    try:
        query_iter = schedd.xquery(requirements=query) if not args.dry_run else []
        for job_ad in query_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(
                    job_ad,
                    return_dict=True,
                    reduce_data=not args.keep_full_queue_data,
                    pool_name=pool_name,
                )
            except Exception as e:
                message = "Failure when converting document on %s queue: %s" % (
                    schedd_ad["Name"], str(e))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms queue document conversion error",
                        message,
                    )
                    sent_warnings = True

            if not dict_ad:
                continue

            batch.append((unique_doc_id(dict_ad), dict_ad))
            count += 1
            count_since_last_report += 1

            if not args.dry_run and len(batch) == args.query_queue_batch_size:
                if time_remaining(starttime) < 10:
                    message = ("Queue crawler on %s has been running for "
                               "more than %d minutes; exiting"
                               % (schedd_ad["Name"], TIMEOUT_MINS))
                    logging.error(message)
                    send_email_alert(args.email_alerts,
                                     "spider_cms queue timeout warning",
                                     message)
                    break
                queue.put(batch, timeout=time_remaining(starttime))
                batch = []

            if count_since_last_report >= 1000:
                cpu_usage_now = resource.getrusage(resource.RUSAGE_SELF).ru_utime
                cpu_usage = cpu_usage_now - cpu_usage
                processing_rate = count_since_last_report / cpu_usage
                cpu_usage = cpu_usage_now
                logging.info(
                    "Processor for %s has processed %d jobs "
                    "(%.1f jobs per CPU-second)",
                    schedd_ad["Name"], count, processing_rate,
                )
                count_since_last_report = 0

            if args.max_documents_to_process and count > args.max_documents_to_process:
                logging.warning(
                    "Aborting after %d documents (--max_documents_to_process option)"
                    % args.max_documents_to_process)
                break
    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))
    except Exception as e:
        message = "Failure when processing schedd queue query on %s: %s" % (
            schedd_ad["Name"], str(e))
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd queue query error", message)
        traceback.print_exc()

    if batch:  # send remaining docs
        queue.put(batch, timeout=time_remaining(starttime))
        batch = []

    queue.put(schedd_ad["Name"], timeout=time_remaining(starttime))
    total_time = (time.time() - my_start) / 60.0
    logging.warning(
        "Schedd %-25s queue: response count: %5d; query time %.2f min;",
        schedd_ad["Name"], count, total_time,
    )
    return count
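# Helpers used throughout this module (time_remaining, unique_doc_id,
# send_email_alert, convert_to_json) are defined elsewhere in the package.
# Below are minimal sketches of the two the control flow above leans on most;
# the bodies are assumptions inferred from how they are called, not the
# upstream implementations.

def time_remaining(starttime, timeout=TIMEOUT_MINS * 60, positive=True):
    """Seconds left in the global crawl budget started at `starttime`.

    Callers pass positive=False when they need to know how far past the
    deadline they are; otherwise the value is clamped at zero so it can be
    used directly as a queue timeout.
    """
    remaining = timeout - (time.time() - starttime)
    if positive:
        return max(0, remaining)
    return remaining


def unique_doc_id(dict_ad):
    """Stable document id for a converted job ad.

    The older process_schedd_queue below indexes documents on GlobalJobId
    directly, so this sketch does the same; the real helper may derive a
    richer key (this is an assumption).
    """
    return dict_ad["GlobalJobId"]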
def process_histories(schedd_ads, starttime, pool, args, metadata=None):
    """
    Process history files for each schedd listed in a given
    multiprocessing pool
    """
    try:
        checkpoint = json.load(open("checkpoint.json"))
    except (IOError, ValueError):
        checkpoint = {}

    futures = []
    metadata = metadata or {}
    metadata["spider_source"] = "condor_history"

    manager = multiprocessing.Manager()
    checkpoint_queue = manager.Queue()

    for schedd_ad in schedd_ads:
        name = schedd_ad["Name"]

        # Check for last completion time
        # If there was no previous completion, get last 12 h
        last_completion = checkpoint.get(name, time.time() - 12 * 3600)

        # For CRAB, only ever get a maximum of 12 h
        if name.startswith("crab") and last_completion < time.time() - 12 * 3600:
            last_completion = time.time() - 12 * 3600

        future = pool.apply_async(
            process_schedd,
            (starttime, last_completion, checkpoint_queue, schedd_ad, args,
             metadata),
        )
        futures.append((name, future))

    def _chkp_updater():
        while True:
            try:
                job = checkpoint_queue.get()
                if job is None:  # Swallow poison pill
                    break
            except EOFError as error:
                logging.warning(
                    "EOFError - Nothing to consume left in the queue %s",
                    error)
                break
            update_checkpoint(*job)

    chkp_updater = multiprocessing.Process(target=_chkp_updater)
    chkp_updater.start()

    # Check whether one of the processes timed out and reset their last
    # completion checkpoint in case
    timed_out = False
    for name, future in futures:
        if time_remaining(starttime) > -10:
            try:
                future.get(time_remaining(starttime) + 10)
            except multiprocessing.TimeoutError:
                # This implies that the checkpoint hasn't been updated
                message = "Schedd %s history timed out; ignoring progress." % name
                exc = traceback.format_exc()
                message += "\n{}".format(exc)
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning",
                                 message)
            except elasticsearch.exceptions.TransportError:
                message = ("Transport error while sending history data of %s; "
                           "ignoring progress." % name)
                exc = traceback.format_exc()
                message += "\n{}".format(exc)
                logging.error(message)
                send_email_alert(
                    args.email_alerts,
                    "spider_cms history transport error warning",
                    message,
                )
        else:
            timed_out = True
            break
    if timed_out:
        pool.terminate()

    checkpoint_queue.put(None)  # Send a poison pill
    chkp_updater.join()

    logging.warning("Processing time for history: %.2f mins",
                    ((time.time() - starttime) / 60.0))
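# `update_checkpoint`, consumed by _chkp_updater above, is not defined in this
# section. A minimal sketch, assuming it mirrors the atomic read-merge-rename
# logic that the older process_histories/process_schedd below perform inline:

def update_checkpoint(name, completion_date):
    try:
        with open("checkpoint.json") as fd:
            checkpoint = json.load(fd)
    except (IOError, ValueError):
        checkpoint = {}

    # Only ever move a schedd's checkpoint forward.
    if name not in checkpoint or checkpoint[name] < completion_date:
        checkpoint[name] = completion_date
        # Write to a temp file and rename so a crash cannot leave a
        # truncated checkpoint.json behind.
        fd, tmpname = tempfile.mkstemp(dir=".", prefix="checkpoint.json.new")
        with os.fdopen(fd, "w") as out:
            json.dump(checkpoint, out)
        os.rename(tmpname, "checkpoint.json")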
def process_schedd(starttime, last_completion, checkpoint_queue, schedd_ad,
                   args, metadata=None):
    """
    Given a schedd, process its entire set of history since last checkpoint.
    """
    my_start = time.time()
    pool_name = schedd_ad.get("CMS_Pool", "Unknown")
    if time_remaining(starttime) < 0:
        message = ("No time remaining to process %s history; exiting."
                   % schedd_ad["Name"])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms history timeout warning", message)
        return last_completion

    metadata = metadata or {}
    schedd = htcondor.Schedd(schedd_ad)
    _q = """
        (EnteredCurrentStatus >= %(last_completion)d
         || CRAB_PostJobLastUpdate >= %(last_completion)d)
        && (CMS_Type != "DONOTMONIT")
        """
    history_query = classad.ExprTree(_q % {"last_completion": last_completion})
    logging.info(
        "Querying %s for history: %s. %.1f minutes of ads",
        schedd_ad["Name"],
        history_query,
        (time.time() - last_completion) / 60.0,
    )
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    timed_out = False
    if not args.read_only:
        if args.feed_es:
            es = htcondor_es.es.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = schedd.history(history_query, [], 10000)
        else:
            history_iter = []

        for job_ad in history_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad, return_dict=True,
                                          pool_name=pool_name)
            except Exception as e:
                message = "Failure when converting document on %s history: %s" % (
                    schedd_ad["Name"], str(e))
                exc = traceback.format_exc()
                message += "\n{}".format(exc)
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms history document conversion error",
                        message,
                    )
                    sent_warnings = True

            if not dict_ad:
                continue

            idx = htcondor_es.es.get_index(
                job_ad["QDate"],
                template=args.es_index_template,
                update_es=(args.feed_es and not args.read_only),
            )
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((unique_doc_id(dict_ad), dict_ad))

            if len(ad_list) == args.es_bunch_size:
                st = time.time()
                if not args.read_only:
                    if args.feed_es:
                        htcondor_es.es.post_ads(es.handle, idx, ad_list,
                                                metadata=metadata)
                    if args.feed_amq:
                        data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                        for id_, dict_ad in ad_list]
                        htcondor_es.amq.post_ads(data_for_amq,
                                                 metadata=metadata)
                logging.debug(
                    "...posting %d ads from %s (process_schedd)",
                    len(ad_list), schedd_ad["Name"],
                )
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            # Find the most recent job and use that date as the new
            # last_completion date
            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion > last_completion:
                last_completion = job_completion

            if time_remaining(starttime) < 0:
                message = ("History crawler on %s has been running for "
                           "more than %d minutes; exiting."
                           % (schedd_ad["Name"], TIMEOUT_MINS))
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning",
                                 message)
                timed_out = True
                break

            if args.max_documents_to_process and count > args.max_documents_to_process:
                logging.warning(
                    "Aborting after %d documents (--max_documents_to_process option)"
                    % args.max_documents_to_process)
                break
    except RuntimeError:
        message = "Failed to query schedd for job history: %s" % schedd_ad["Name"]
        exc = traceback.format_exc()
        message += "\n{}".format(exc)
        logging.error(message)
    except Exception as exn:
        message = "Failure when processing schedd history query on %s: %s" % (
            schedd_ad["Name"], str(exn))
        exc = traceback.format_exc()
        message += "\n{}".format(exc)
        logging.exception(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd history query error", message)

    # Post the remaining ads
    for idx, ad_list in list(buffered_ads.items()):
        if ad_list:
            logging.debug(
                "...posting remaining %d ads from %s (process_schedd)",
                len(ad_list), schedd_ad["Name"],
            )
            if not args.read_only:
                if args.feed_es:
                    htcondor_es.es.post_ads(es.handle, idx, ad_list,
                                            metadata=metadata)
                if args.feed_amq:
                    data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                    for id_, dict_ad in ad_list]
                    htcondor_es.amq.post_ads(data_for_amq, metadata=metadata)

    total_time = (time.time() - my_start) / 60.0
    total_upload /= 60.0
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.warning(
        "Schedd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min",
        schedd_ad["Name"],
        count,
        last_formatted,
        total_time - total_upload,
        total_upload,
    )

    # If we got to this point without a timeout, all these jobs have
    # been processed and uploaded, so we can update the checkpoint
    if not timed_out:
        checkpoint_queue.put((schedd_ad["Name"], last_completion))
    return last_completion
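# `convert_dates_to_millisecs` (used before shipping ads to StompAMQ here and
# in both process_queues variants) is defined elsewhere. A sketch of the idea,
# assuming the date attributes are second-resolution epoch integers; the field
# list below is illustrative, not exhaustive:

def convert_dates_to_millisecs(record):
    for date_field in ("QDate", "JobStartDate", "EnteredCurrentStatus",
                       "CRAB_PostJobLastUpdate", "CompletionDate"):
        try:
            record[date_field] *= 1000  # ES and AMQ expect epoch milliseconds
        except (KeyError, TypeError):
            continue
    return record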
def process_queues(schedd_ads, starttime, pool, args):
    """
    Process all the jobs in all the schedds given.
    """
    my_start = time.time()
    if time_remaining(starttime) < 0:
        logging.warning("No time remaining to process queues")
        return

    mp_manager = multiprocessing.Manager()
    input_queue = mp_manager.Queue(maxsize=10)
    output_queue = mp_manager.Queue(maxsize=2)
    listener = ListenAndBunch(input_queue=input_queue,
                              output_queue=output_queue,
                              n_expected=len(schedd_ads),
                              starttime=starttime,
                              bunch_size=5000)
    futures = []

    upload_pool = multiprocessing.Pool(processes=3)

    for schedd_ad in schedd_ads:
        future = pool.apply_async(process_schedd_queue,
                                  args=(starttime, schedd_ad, input_queue,
                                        args))
        futures.append((schedd_ad['Name'], future))

    def _callback_amq(result):
        sent, received, elapsed = result
        logging.info("Uploaded %d/%d docs to StompAMQ in %d seconds",
                     sent, received, elapsed)

    total_processed = 0
    while True:
        if args.dry_run or len(schedd_ads) == 0:
            break

        if time_remaining(starttime) < -10:
            logging.warning("Listener did not shut down properly; terminating.")
            listener.terminate()
            break

        bunch = output_queue.get(timeout=time_remaining(starttime))
        if bunch is None:  # swallow the poison pill
            total_processed = int(
                output_queue.get(timeout=time_remaining(starttime)))
            break

        if args.feed_amq and not args.read_only:
            amq_bunch = [(id_, convert_dates_to_millisecs(dict_ad))
                         for id_, dict_ad in bunch]
            future = upload_pool.apply_async(htcondor_es.amq.post_ads,
                                             args=(amq_bunch, ),
                                             callback=_callback_amq)
            futures.append(("UPLOADER_AMQ", future))

        if args.feed_es_for_queues and not args.read_only:
            es_bunch = [(id_, json.dumps(dict_ad)) for id_, dict_ad in bunch]
            ## FIXME: Why are we determining the index from one ad?
            idx = htcondor_es.es.get_index(
                bunch[0][1].get("QDate", int(time.time())),
                template=args.es_index_template,
                update_es=(args.feed_es and not args.read_only))
            future = upload_pool.apply_async(htcondor_es.es.post_ads_nohandle,
                                             args=(idx, es_bunch, args))
            futures.append(("UPLOADER_ES", future))

        # Throttle the number of uploads in flight: once too many futures
        # are outstanding, block on the first one before dispatching more.
        max_in_progress = 3
        count = len(futures)
        while count > max_in_progress:
            if time_remaining(starttime) < 0:
                break
            for future in futures:
                if future[1].ready():
                    count -= 1
                if count > max_in_progress:
                    break
            for future in futures:
                future[1].wait(time_remaining(starttime) + 10)
                break
            count = len(futures)

    listener.join()

    timed_out = False
    total_sent = 0
    total_upload_time = 0
    total_queried = 0
    for name, future in futures:
        if time_remaining(starttime) > -10:
            try:
                count = future.get(time_remaining(starttime) + 10)
                if name == "UPLOADER_AMQ":
                    total_sent += count[0]
                    total_upload_time += count[2]
                elif name == "UPLOADER_ES":
                    total_sent += count
                else:
                    total_queried += count
            except multiprocessing.TimeoutError:
                message = "Schedd %s queue timed out; ignoring progress." % name
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms queue timeout warning", message)
        else:
            timed_out = True
            break
    if timed_out:
        pool.terminate()
        upload_pool.terminate()

    if total_queried != total_processed:
        logging.warning(
            "Number of queried docs not equal to number of processed docs.")

    logging.warning(
        "Processing time for queues: %.2f mins, %d/%d docs sent in %.2f min "
        "of total upload time",
        (time.time() - my_start) / 60.,
        total_sent, total_queried, total_upload_time / 60.)

    upload_pool.close()
    upload_pool.join()
def process_queues(schedd_ads, starttime, pool, args, metadata=None):
    """
    Process all the jobs in all the schedds given.
    """
    my_start = time.time()
    if time_remaining(starttime) < 10:
        logging.warning("No time remaining to process queues")
        return

    metadata = metadata or {}
    metadata["spider_source"] = "condor_queue"

    mp_manager = multiprocessing.Manager()
    input_queue = mp_manager.Queue()
    output_queue = mp_manager.Queue()
    listener = ListenAndBunch(
        input_queue=input_queue,
        output_queue=output_queue,
        n_expected=len(schedd_ads),
        starttime=starttime,
        bunch_size=args.amq_bunch_size,
    )
    futures = []

    upload_pool = multiprocessing.Pool(processes=args.upload_pool_size)

    for schedd_ad in schedd_ads:
        future = pool.apply_async(
            query_schedd_queue, args=(starttime, schedd_ad, input_queue, args)
        )
        futures.append((schedd_ad["Name"], future))

    def _callback_amq(result):
        sent, received, elapsed = result
        logging.info(
            "Uploaded %d/%d docs to StompAMQ in %d seconds",
            sent, received, elapsed
        )

    total_processed = 0
    while True:
        if args.dry_run or len(schedd_ads) == 0:
            break

        if time_remaining(starttime) < 5:
            logging.warning("Listener did not shut down properly; terminating.")
            listener.terminate()
            break

        bunch = output_queue.get(timeout=time_remaining(starttime))
        if bunch is None:  # swallow the poison pill
            total_processed = int(
                output_queue.get(timeout=time_remaining(starttime)))
            break

        if args.feed_es_for_queues and not args.read_only:
            ## Note that these bunches are sized according to --amq_bunch_size
            ## FIXME: Why are we determining the index from one ad?
            idx = htcondor_es.es.get_index(
                bunch[0][1].get("QDate", int(time.time())),
                template=args.es_index_template,
                update_es=(args.feed_es and not args.read_only),
            )
            future = upload_pool.apply_async(
                htcondor_es.es.post_ads_nohandle,
                args=(idx, bunch, args, metadata)
            )
            futures.append(("UPLOADER_ES", future))

        if args.feed_amq and not args.read_only:
            amq_bunch = [(id_, convert_dates_to_millisecs(dict_ad))
                         for id_, dict_ad in bunch]
            future = upload_pool.apply_async(
                htcondor_es.amq.post_ads,
                args=(amq_bunch, metadata),
                callback=_callback_amq,
            )
            futures.append(("UPLOADER_AMQ", future))

        logging.info("Starting new uploader, %d items in queue"
                     % output_queue.qsize())

    listener.join()

    timed_out = False
    total_sent = 0
    total_upload_time = 0
    total_queried = 0
    for name, future in futures:
        if time_remaining(starttime, positive=False) > -20:
            try:
                count = future.get(time_remaining(starttime) + 10)
                if name == "UPLOADER_AMQ":
                    total_sent += count[0]
                    total_upload_time += count[2]
                elif name == "UPLOADER_ES":
                    total_sent += count
                else:
                    try:
                        total_queried += count
                    except TypeError:
                        pass
            except multiprocessing.TimeoutError:
                message = "Schedd %s queue timed out; ignoring progress." % name
                logging.error(message)
                send_email_alert(
                    args.email_alerts, "spider_cms queue timeout warning",
                    message
                )
        else:
            timed_out = True
            break
    if timed_out:
        logging.error("Timed out when retrieving uploaders. "
                      "Upload count incomplete.")
        pool.terminate()
        upload_pool.terminate()

    if total_queried != total_processed:
        logging.warning(
            "Number of queried docs not equal to number of processed docs.")

    logging.warning(
        "Processing time for queues: %.2f mins, %d/%d docs sent in %.2f min "
        "of total upload time",
        (time.time() - my_start) / 60.0,
        total_sent,
        total_queried,
        total_upload_time / 60.0,
    )

    upload_pool.close()
    upload_pool.join()
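# `ListenAndBunch` is not defined in this section. The sketch below captures
# the contract implied by its producers and consumers: each crawler puts its
# schedd name once at startup and once when done, with batches of
# (doc_id, doc) pairs in between; the listener re-bunches the documents and
# finishes with a None poison pill followed by the total document count. It
# also assumes the class starts itself, since neither process_queues variant
# ever calls .start(). Illustrative only; the real implementation likely
# differs in details such as timeout handling.

class ListenAndBunch(multiprocessing.Process):
    def __init__(self, input_queue, output_queue, n_expected,
                 starttime, bunch_size):
        super(ListenAndBunch, self).__init__()
        self.input_queue = input_queue
        self.output_queue = output_queue
        self.n_expected = n_expected  # number of producer processes
        self.starttime = starttime
        self.bunch_size = bunch_size
        self.start()  # assumption: the upstream class launches itself

    def run(self):
        buf, seen, n_done, n_docs = [], set(), 0, 0
        while n_done < self.n_expected:
            item = self.input_queue.get(timeout=time_remaining(self.starttime))
            if isinstance(item, str):
                # A schedd name: first sighting means "started",
                # second means "finished".
                if item in seen:
                    n_done += 1
                else:
                    seen.add(item)
                continue
            buf.extend(item)
            n_docs += len(item)
            while len(buf) >= self.bunch_size:
                self.output_queue.put(buf[:self.bunch_size])
                buf = buf[self.bunch_size:]
        if buf:
            self.output_queue.put(buf)
        self.output_queue.put(None)    # poison pill for process_queues
        self.output_queue.put(n_docs)  # total documents processed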
def process_schedd_queue(starttime, schedd_ad, queue, args):
    """
    Query a schedd for its current job queue and feed batches of converted
    ads into the given queue.
    """
    my_start = time.time()
    logging.info("Querying %s queue for jobs.", schedd_ad["Name"])
    if time_remaining(starttime) < 0:
        message = ("No time remaining to run queue crawler on %s; "
                   "exiting." % schedd_ad['Name'])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms queue timeout warning", message)
        return

    count_since_last_report = 0
    count = 0
    cpu_usage = resource.getrusage(resource.RUSAGE_SELF).ru_utime
    queue.put(schedd_ad['Name'], timeout=time_remaining(starttime))

    schedd = htcondor.Schedd(schedd_ad)
    sent_warnings = False
    batch = []
    try:
        query_iter = schedd.xquery() if not args.dry_run else []
        for job_ad in query_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad, return_dict=True,
                                          reduce_data=args.reduce_running_data)
            except Exception as e:
                message = ("Failure when converting document on %s queue: %s"
                           % (schedd_ad["Name"], str(e)))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms queue document conversion error",
                        message)
                    sent_warnings = True

            if not dict_ad:
                continue

            batch.append((job_ad["GlobalJobId"], dict_ad))
            count += 1
            count_since_last_report += 1

            if not args.dry_run and len(batch) == args.query_queue_batch_size:
                if time_remaining(starttime) < 0:
                    message = ("Queue crawler on %s has been running for "
                               "more than %d minutes; exiting"
                               % (schedd_ad['Name'], TIMEOUT_MINS))
                    logging.error(message)
                    send_email_alert(args.email_alerts,
                                     "spider_cms queue timeout warning",
                                     message)
                    break
                queue.put(batch, timeout=time_remaining(starttime))
                batch = []

            if count_since_last_report >= 1000:
                cpu_usage_now = resource.getrusage(resource.RUSAGE_SELF).ru_utime
                cpu_usage = cpu_usage_now - cpu_usage
                processing_rate = count_since_last_report / cpu_usage
                cpu_usage = cpu_usage_now
                logging.info("Processor for %s has processed %d jobs "
                             "(%.1f jobs per CPU-second)",
                             schedd_ad['Name'], count, processing_rate)
                count_since_last_report = 0

        if batch:  # send remaining docs
            queue.put(batch, timeout=time_remaining(starttime))
            batch = []
    except RuntimeError as e:
        logging.error("Failed to query schedd %s for jobs: %s",
                      schedd_ad["Name"], str(e))
    except Exception as e:
        message = ("Failure when processing schedd queue query on %s: %s"
                   % (schedd_ad["Name"], str(e)))
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd queue query error", message)
        traceback.print_exc()

    queue.put(schedd_ad['Name'], timeout=time_remaining(starttime))
    total_time = (time.time() - my_start) / 60.
    logging.warning(
        "Schedd %-25s queue: response count: %5d; query time %.2f min;",
        schedd_ad["Name"], count, total_time)
    return count
def process_schedd(starttime, last_completion, schedd_ad, args):
    """
    Given a schedd, process its entire set of history since last checkpoint.
    """
    my_start = time.time()
    if time_remaining(starttime) < 0:
        message = ("No time remaining to process %s history; exiting."
                   % schedd_ad['Name'])
        logging.error(message)
        send_email_alert(args.email_alerts,
                         "spider_cms history timeout warning", message)
        return last_completion

    schedd = htcondor.Schedd(schedd_ad)
    history_query = classad.ExprTree("EnteredCurrentStatus >= %d"
                                     % last_completion)
    logging.info("Querying %s for history: %s. "
                 "%.1f minutes of ads",
                 schedd_ad["Name"], history_query,
                 (time.time() - last_completion) / 60.)
    buffered_ads = {}
    count = 0
    total_upload = 0
    sent_warnings = False
    if not args.read_only:
        if args.feed_es:
            es = htcondor_es.es.get_server_handle(args)
    try:
        if not args.dry_run:
            history_iter = schedd.history(history_query, [], 10000)
        else:
            history_iter = []

        for job_ad in history_iter:
            dict_ad = None
            try:
                dict_ad = convert_to_json(job_ad, return_dict=True)
            except Exception as e:
                message = ("Failure when converting document on %s history: %s"
                           % (schedd_ad["Name"], str(e)))
                logging.warning(message)
                if not sent_warnings:
                    send_email_alert(
                        args.email_alerts,
                        "spider_cms history document conversion error",
                        message)
                    sent_warnings = True

            if not dict_ad:
                continue

            idx = htcondor_es.es.get_index(
                job_ad["QDate"],
                template=args.es_index_template,
                update_es=(args.feed_es and not args.read_only))
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((job_ad["GlobalJobId"], dict_ad))

            if len(ad_list) == args.bunching:
                st = time.time()
                if not args.read_only:
                    if args.feed_es:
                        data_for_es = [(id_, json.dumps(dict_ad))
                                       for id_, dict_ad in ad_list]
                        htcondor_es.es.post_ads(es=es.handle, idx=idx,
                                                ads=data_for_es)
                    if args.feed_amq:
                        data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                        for id_, dict_ad in ad_list]
                        htcondor_es.amq.post_ads(data_for_amq)
                logging.debug("...posting %d ads from %s (process_schedd)",
                              len(ad_list), schedd_ad["Name"])
                total_upload += time.time() - st
                buffered_ads[idx] = []

            count += 1

            job_completion = job_ad.get("EnteredCurrentStatus")
            if job_completion > last_completion:
                last_completion = job_completion

            if time_remaining(starttime) < 0:
                message = ("History crawler on %s has been running for "
                           "more than %d minutes; exiting."
                           % (schedd_ad["Name"], TIMEOUT_MINS))
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning",
                                 message)
                break
    except RuntimeError:
        logging.error("Failed to query schedd for job history: %s",
                      schedd_ad["Name"])
    except Exception as exn:
        message = ("Failure when processing schedd history query on %s: %s"
                   % (schedd_ad["Name"], str(exn)))
        logging.exception(message)
        send_email_alert(args.email_alerts,
                         "spider_cms schedd history query error", message)

    # Post the remaining ads
    for idx, ad_list in buffered_ads.items():
        if ad_list:
            logging.debug("...posting remaining %d ads from %s "
                          "(process_schedd)", len(ad_list), schedd_ad["Name"])
            if not args.read_only:
                if args.feed_es:
                    htcondor_es.es.post_ads(
                        es=es.handle, idx=idx,
                        ads=[(id_, json.dumps(dict_ad))
                             for id_, dict_ad in ad_list])
                if args.feed_amq:
                    data_for_amq = [(id_, convert_dates_to_millisecs(dict_ad))
                                    for id_, dict_ad in ad_list]
                    htcondor_es.amq.post_ads(data_for_amq)

    total_time = (time.time() - my_start) / 60.
    total_upload /= 60.
    last_formatted = datetime.datetime.fromtimestamp(last_completion).strftime(
        "%Y-%m-%d %H:%M:%S")
    logging.warning(
        "Schedd %-25s history: response count: %5d; last completion %s; "
        "query time %.2f min; upload time %.2f min",
        schedd_ad["Name"], count, last_formatted,
        total_time - total_upload, total_upload)

    # Update the checkpoint file atomically: merge with what's on disk and
    # only ever move the timestamp forward.
    try:
        checkpoint_new = json.load(open("checkpoint.json"))
    except (IOError, ValueError):
        checkpoint_new = {}

    if ((schedd_ad["Name"] not in checkpoint_new)
            or (checkpoint_new[schedd_ad["Name"]] < last_completion)):
        checkpoint_new[schedd_ad["Name"]] = last_completion
        fd, tmpname = tempfile.mkstemp(dir=".", prefix="checkpoint.json.new")
        fd = os.fdopen(fd, "w")
        json.dump(checkpoint_new, fd)
        fd.close()
        os.rename(tmpname, "checkpoint.json")

    return last_completion
def process_histories(schedd_ads, starttime, pool, args):
    """
    Process history files for each schedd listed in a given
    multiprocessing pool
    """
    try:
        checkpoint = json.load(open("checkpoint.json"))
    except (IOError, ValueError):
        checkpoint = {}

    futures = []

    for schedd_ad in schedd_ads:
        name = schedd_ad["Name"]

        # Check for last completion time
        # If there was no previous completion, get last 12 h
        last_completion = checkpoint.get(name, time.time() - 12 * 3600)

        # For CRAB, only ever get a maximum of 12 h
        if name.startswith("crab") and last_completion < time.time() - 12 * 3600:
            last_completion = time.time() - 12 * 3600

        future = pool.apply_async(
            process_schedd, (starttime, last_completion, schedd_ad, args))
        futures.append((name, future))

    # Check whether one of the processes timed out and reset their last
    # completion checkpoint in case
    timed_out = False
    for name, future in futures:
        if time_remaining(starttime) > -10:
            try:
                last_completion = future.get(time_remaining(starttime) + 10)
                if name:
                    checkpoint[name] = last_completion
            except multiprocessing.TimeoutError:
                message = "Schedd %s history timed out; ignoring progress." % name
                logging.error(message)
                send_email_alert(args.email_alerts,
                                 "spider_cms history timeout warning", message)
        else:
            timed_out = True
            break
    if timed_out:
        pool.terminate()

    # Update the last completion checkpoint file
    try:
        checkpoint_new = json.load(open("checkpoint.json"))
    except (IOError, ValueError):
        checkpoint_new = {}

    for key, val in checkpoint.items():
        if (key not in checkpoint_new) or (val > checkpoint_new[key]):
            checkpoint_new[key] = val

    fd, tmpname = tempfile.mkstemp(dir=".", prefix="checkpoint.json.new")
    fd = os.fdopen(fd, "w")
    json.dump(checkpoint_new, fd)
    fd.close()
    os.rename(tmpname, "checkpoint.json")

    logging.warning("Processing time for history: %.2f mins",
                    ((time.time() - starttime) / 60.))
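# A hedged sketch of how these entry points are typically driven. The
# collector host and the args flags used here (query_pool_size, skip_history,
# process_queue) are assumptions for illustration and may not match the real
# command-line interface:

def main_driver(args):
    starttime = time.time()

    # Locate all schedds known to the (hypothetical) collector.
    collector = htcondor.Collector("collector.example.com")
    schedd_ads = collector.locateAll(htcondor.DaemonTypes.Schedd)

    # Both crawlers share one global start time so they honor TIMEOUT_MINS.
    pool = multiprocessing.Pool(processes=args.query_pool_size)
    if not args.skip_history:
        process_histories(schedd_ads, starttime, pool, args)
    if args.process_queue:
        process_queues(schedd_ads, starttime, pool, args)
    pool.close()
    pool.join()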