def main():
    '''Parse the command line, start jobmon, and run the dismod cascade.

    Marks the model version RUNNING, records the code commit hash, builds
    and executes the jobmon workflow, and marks the model FAILED unless
    the workflow ends with ``DagExecutionStatus.SUCCEEDED``.
    '''
    args = parse_args()
    info_level = 1
    dirs = prepare_directories(args.mvid, create_directories=False)
    logging_filepath = '%s/%s' % (
        dirs['model_logdir'], f'{args.mvid}_driver.log')
    setup_logger(
        logging_filepath, level=args.quiet - args.verbose + info_level)
    log = logging.getLogger(__name__)
    log.debug("main started")
    setup_io_patches(args.no_upload)

    mvid = args.mvid
    add_arguments = inverse_parser(args)
    wf_status = None
    try:
        upload.update_model_status(mvid, upload.RUNNING)
        try:
            commit_hash = sge.get_commit_hash(
                dir='%s/..' % drill.this_path)
        except subprocess.CalledProcessError:
            # Not running from a git checkout (e.g. installed in
            # site-packages); fall back to the package version string.
            commit_hash = __version__
        upload.set_commit_hash(mvid, commit_hash)
        driver = Driver(mvid)
        wf = driver.build_jobmon_workflow(
            identifier=args.workflow,
            extra_arguments=add_arguments
        )
        wf_status = wf.execute()
    except Exception:
        # wf_status remains None here, so the check below marks the model
        # FAILED exactly once. (Previously FAILED was uploaded both here
        # and again in the post-try check.)
        log.exception("error in main driver with args {}".format(
            str(args)))
    if wf_status != DagExecutionStatus.SUCCEEDED:
        upload.update_model_status(mvid, upload.FAILED)
def main(): """ Parses command line to launch the initial global dismod model """ args = parse_args() dirs = prepare_directories(args.mvid, create_directories=False) logging_filepath = '%s/%s' % ( dirs['model_logdir'], f'{args.mvid}_{args.cv_iter}_global.log') setup_logger( logging_filepath, level=args.quiet - args.verbose) log = logging.getLogger(__name__) log.debug("main started") setup_io_patches(args.no_upload) try: decomp_method = get_decomp_method_from_mv_and_flags(args.mvid, args) global_model = GlobalCascade( args.mvid, args.cv_iter, args.no_upload, decomp_method[Methods.DISABLE_NON_STANDARD_LOCATIONS]) global_model.run_global(feature_flags=args) except Exception as e: log.exception("error in main run_global with args {}".format( str(args))) if args.pdb: # This invokes a live pdb debug session when an uncaught # exception makes it here. import pdb import traceback traceback.print_exc() pdb.post_mortem() elif check_error_msg_for_sigkill(str(e)): log.error('Found error SIGKILL:9, assuming kernel failed from ' 'memory overages, returning code 137.') sys.exit(137) else: raise e
def main():
    '''Entry point for launching a dismod model from Epi-Viz.

    Reads the model_version_id from the command line, prepares the model
    directories, selects a cluster project, and submits the cascade
    driver job.
    '''
    args = parse_args()
    try:
        log_level_offset = -1
        dirs = prepare_directories(args.mvid)
        setup_io_patches(args.no_upload)

        # A fixed set of TB-related modelable entities runs under its own
        # cluster project; everything else uses the dismod project.
        meid = get_meid(args.mvid)
        project = (
            "proj_tb" if meid in (9422, 7695, 1175, 10352, 9309)
            else "proj_dismod")

        logging_filepath = '%s/%s' % (
            dirs['model_logdir'], f'{args.mvid}_run_all.log')
        setup_logger(
            logging_filepath,
            level=args.quiet - args.verbose + log_level_offset)

        add_arguments = inverse_parser(args)
        driver_arguments = (
            ["--workflow", args.workflow] if args.workflow else [])
        submit_driver(
            args.mvid, project, dirs, add_arguments, driver_arguments)
    except Exception:
        if args.pdb:
            # Drop into a live pdb session when an uncaught exception
            # reaches this handler.
            import pdb
            import traceback
            traceback.print_exc()
            pdb.post_mortem()
        else:
            raise
def main():
    '''Finalize ("varnish") a completed cascade model run.

    Records the commit hash, uploads the model, writes fit statistics and
    effect-plot PDFs (best effort), removes stale aggregates, and launches
    the final aggregation of draws up the location hierarchy.
    '''
    args = parse_args()
    mvid = args.mvid
    log_level_offset = -1
    dirs = prepare_directories(mvid, create_directories=False)
    logging_filepath = '%s/%s' % (
        dirs['model_logdir'], f'{args.mvid}_varnish.log')
    setup_logger(
        logging_filepath,
        level=args.quiet - args.verbose + log_level_offset)
    log = logging.getLogger(__name__)
    log.info("Varnish started for mvid {}".format(mvid))
    setup_io_patches(args.no_upload)
    try:
        try:
            commit_hash = get_commit_hash(dir='%s/..' % drill.this_path)
        except subprocess.CalledProcessError:
            # Installed in site-packages rather than a git checkout.
            commit_hash = __version__
        upload.set_commit_hash(mvid, commit_hash)
        upload.upload_model(mvid)

        outdir = "%s/%s/full" % (
            drill.settings['cascade_ode_out_dir'], str(mvid))
        joutdir = "%s/%s" % (drill.settings['diag_out_dir'], mvid)

        fit_df = fit_stats.write_fit_stats(mvid, outdir, joutdir)
        if fit_df is None:
            log.warning("No fit stats computed")
        else:
            try:
                upload.upload_fit_stat(mvid)
            except sqlalchemy.exc.IntegrityError:
                log.warning("fit stat already uploaded -- skipping")

        # Render effect PDFs via R; a plotting failure is logged but does
        # not abort the varnish step.
        plotter = os.path.realpath(
            "{}/effect_plots.r".format(drill.this_path))
        demo = Demographics(mvid)
        try:
            subprocess.check_output([
                "Rscript", plotter, str(mvid), joutdir,
                drill.settings['cascade_ode_out_dir'],
                str(max(demo.year_ids))
            ], stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            log.exception("Error in effect plots")

        # Clean prior aggregations so re-running varnish stays idempotent.
        decomp_step = decomp_step_from_decomp_step_id(
            importer.get_model_version(mvid).decomp_step_id.unique()[0])
        clean_model_directory(outdir, demo.gbd_round_id, decomp_step)

        # Launch final aggregations.
        log.info("Starting Save Results")
        aggregate_model(mvid, demo=demo, no_upload=args.no_upload)
    except Exception:
        log.exception("Error in varnish")
        raise
def run_location(args):
    '''Run dismod for one location; meant for use with multiprocessing.

    Args:
        args (tuple): Packed pair of (proxy to a shared dictionary,
            location ID). The shared dictionary carries the cascade,
            the parent Cascade_loc, sex_id, year, the full_timespan
            flag, and the parsed command-line arguments.

    Returns:
        Tuple of location_id and either a string error message or the
        integer 0, representing no error.
    '''
    shared, loc_id = args
    gc.collect()
    sex_id = shared["sex_id"]
    year = shared["year"]
    full_timespan = shared["full_timespan"]
    args = shared["args"]
    cascade = shared["cascade"]
    cl_parent = shared["cl_parent"]
    # Each subprocess has its own imports, so patches need to be redone.
    setup_io_patches(args.no_upload)

    def _fit_location():
        # Build the Cascade_loc and run the full dismod pipeline for this
        # location. (Previously duplicated across the debug/non-debug
        # branches; only the exception handling differs between them.)
        if full_timespan:
            cl = Cascade_loc(loc_id, sex_id, year, cascade,
                             timespan=50, parent_loc=cl_parent,
                             feature_flags=args)
        else:
            cl = Cascade_loc(loc_id, sex_id, year, cascade,
                             parent_loc=cl_parent, feature_flags=args)
        cl.initialize()
        cl.run_dismod()
        cl.summarize_posterior()
        cl.draw()
        cl.predict()

    if args.debug:
        # In debug mode, let exceptions propagate so pdb/tracebacks work.
        _fit_location()
        return loc_id, 0
    else:
        try:
            _fit_location()
            return loc_id, 0
        except Exception as e:
            logging.exception("Failure running location {}".format(loc_id))
            return loc_id, str(e)
def main():
    '''Read command line arguments to run dismod for all child location
    ids of given location ids.

    Fits the parent location, then runs each child location (serially in
    debug mode, otherwise distributed across processes). Exits 137 when a
    child failure looks like an OOM SIGKILL so jobmon can retry with more
    resources; raises RuntimeError on other dismod kernel failures.
    '''
    args = parse_args()
    mvid = args.mvid
    location_id = args.location_id
    sex = args.sex
    y = args.year_id
    cv_iter = args.cv_iter
    if args.debug not in {None, "debug"}:
        raise AttributeError(
            f"Debug flag should be off or 'debug' but is {args.debug}.")
    dirs = prepare_directories(mvid, create_directories=False)
    logging_filepath = '%s/%s' % (
        dirs['model_logdir'],
        f'{mvid}_{location_id}_{sex}_{y}_{cv_iter}_child.log')
    setup_logger(logging_filepath, level=args.quiet - args.verbose)
    log = logging.getLogger(__name__)
    log.info(
        "Starting cascade mvid {} loc {} sex {} year {} cv_iter {}".format(
            mvid, location_id, sex, y, cv_iter))
    setup_io_patches(args.no_upload)
    # Dismod encodes sex as a +/-0.5 covariate value.
    sex_dict = {'male': 0.5, 'female': -0.5}
    sex_id = sex_dict[sex]
    log.info("Creating cascade")
    cascade = Cascade(mvid, reimport=False, cv_iter=cv_iter,
                      feature_flags=args)
    log.info("Done with cascade")
    year_split_lvl = cascade.model_version_meta.fix_year.values[0] - 1
    lt = cascade.loctree
    this_lvl = lt.get_nodelvl_by_id(location_id)
    log.info("Generating cascade loc")
    if location_id == 1:
        # The global location always fits both-sex, year 2000.
        cl_parent = Cascade_loc(location_id, 0, 2000, cascade,
                                reimport=False, feature_flags=args)
    else:
        cl_parent = Cascade_loc(location_id, sex_id, y, cascade,
                                reimport=False, feature_flags=args)
    cl_parent.initialize()
    log.info("Done generating cascade loc")
    full_timespan = this_lvl < (year_split_lvl - 1)

    # Run child locations
    arglist = []
    for child_loc in lt.get_node_by_id(location_id).children:
        arglist.append(child_loc.id)
    shared_to_children = dict(
        cascade=cascade,
        cl_parent=cl_parent,
        sex_id=sex_id,
        year=y,
        full_timespan=full_timespan,
        args=args,
    )
    if args.debug:
        # ..... RUNNING IN SINGLE PROCESS DEBUG MODE .....
        # (was a no-op bare string expression; now a proper comment)
        try:
            res = [
                run_location((shared_to_children, work))
                for work in arglist
            ]
        except Exception:
            res = list()
            if args.pdb:
                # This invokes a live pdb debug session when an uncaught
                # exception makes it here.
                import pdb
                import traceback
                traceback.print_exc()
                pdb.post_mortem()
            else:
                raise
    else:
        res = distribute(run_location, shared_to_children, arglist)

    log.info("Done running")
    try:
        errors = ['%s: %s' % (str(r[0]), r[1]) for r in res if r[1] != 0]
    except concurrent.futures.process.BrokenProcessPool:
        log.error(
            ("Process pool died abruptly. Assuming sigkill due to OOM killer."
             " Returning exit code 137 for jobmon resource retry"))
        sys.exit(137)
    if len(errors) == 0:
        log.info("No errors found")
    else:
        num_errors = len(errors)
        error_msg = "; ".join(errors)
        # NOTE: added the missing space before "{}" — the implicit string
        # concatenation previously rendered as "cv_iter<value>".
        log.error("Found {} errors for mvid {} loc {} sex {} year {} cv_iter"
                  " {}: {}".format(num_errors, mvid, location_id, sex, y,
                                   cv_iter, error_msg))
        if check_error_msg_for_sigkill(error_msg):
            log.error(
                '"Signals.SIGKILL: 9" found in error_msg. Returning exit code '
                ' 137 for jobmon resource retry.')
            sys.exit(137)
        else:
            raise RuntimeError('Dismod kernel failures.')