Example no. 1
def main():
    '''
    Parses command line to start jobmon and run the dismod cascade.
    '''
    args = parse_args()
    info_level = 1
    dirs = prepare_directories(args.mvid, create_directories=False)
    logging_filepath = f"{dirs['model_logdir']}/{args.mvid}_driver.log"
    setup_logger(
        logging_filepath, level=args.quiet - args.verbose + info_level)
    log = logging.getLogger(__name__)
    log.debug("main started")
    setup_io_patches(args.no_upload)

    mvid = args.mvid
    add_arguments = inverse_parser(args)

    wf_status = None

    try:
        upload.update_model_status(mvid, upload.RUNNING)
        try:
            commit_hash = sge.get_commit_hash(
                dir='%s/..' % drill.this_path)
        except subprocess.CalledProcessError:
            commit_hash = __version__
        upload.set_commit_hash(mvid, commit_hash)
        driver = Driver(mvid)
        wf = driver.build_jobmon_workflow(
            identifier=args.workflow, extra_arguments=add_arguments
        )
        wf_status = wf.execute()
    except Exception:
        log.exception("error in main driver with args {}".format(
            str(args)))
        upload.update_model_status(mvid, upload.FAILED)

    # A workflow that finished in any state other than SUCCEEDED (or never
    # ran because of the exception above) also marks the model FAILED.
    if wf_status != DagExecutionStatus.SUCCEEDED:
        upload.update_model_status(mvid, upload.FAILED)
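The commit-hash fallback above distinguishes a git checkout from an installed package. A minimal sketch of what a helper like sge.get_commit_hash plausibly does (the real implementation is not shown in this snippet):

import subprocess

def get_commit_hash(dir='.'):
    # Ask git for the current revision of the given directory. The caller
    # above catches CalledProcessError and falls back to __version__ when
    # running from site-packages rather than a git checkout.
    return subprocess.check_output(
        ['git', 'rev-parse', 'HEAD'], cwd=dir, text=True).strip()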
Example no. 2
def main():
    """
    Parses command line to launch the initial global dismod model
    """
    args = parse_args()
    dirs = prepare_directories(args.mvid, create_directories=False)
    logging_filepath = (
        f"{dirs['model_logdir']}/{args.mvid}_{args.cv_iter}_global.log")
    setup_logger(
        logging_filepath,
        level=args.quiet - args.verbose)
    log = logging.getLogger(__name__)
    log.debug("main started")

    setup_io_patches(args.no_upload)

    try:
        decomp_method = get_decomp_method_from_mv_and_flags(args.mvid, args)
        global_model = GlobalCascade(
            args.mvid, args.cv_iter, args.no_upload,
            decomp_method[Methods.DISABLE_NON_STANDARD_LOCATIONS])
        global_model.run_global(feature_flags=args)
    except Exception as e:
        log.exception("error in main run_global with args {}".format(
            str(args)))
        if args.pdb:
            # This invokes a live pdb debug session when an uncaught
            # exception makes it here.
            import pdb
            import traceback

            traceback.print_exc()
            pdb.post_mortem()
        elif check_error_msg_for_sigkill(str(e)):
            log.error('Found error SIGKILL:9, assuming kernel failed from '
                      'memory overages, returning code 137.')
            sys.exit(137)
        else:
            # Bare raise keeps the original traceback intact.
            raise
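Exit code 137 follows the shell convention of 128 + the signal number, so a SIGKILL (9) from the kernel's OOM killer surfaces as 137. A one-line reconstruction of check_error_msg_for_sigkill is sketched below; the real helper is not shown here, but subprocess stringifies a child killed by signal 9 with exactly this marker:

def check_error_msg_for_sigkill(error_msg):
    # subprocess.CalledProcessError renders a SIGKILL'd child as
    # "... died with <Signals.SIGKILL: 9>.", so a substring test suffices.
    return 'Signals.SIGKILL: 9' in error_msg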
Example no. 3
def main():
    ''' Main entry point to launching a dismod model via Epi-Viz. Reads
    model_version_id from command line arguments, creates directories, and
    qsubs cascade job'''
    args = parse_args()
    try:
        default_debug_level = -1
        dirs = prepare_directories(args.mvid)
        setup_io_patches(args.no_upload)
        meid = get_meid(args.mvid)
        # A few TB modelable entities run under their own cluster project.
        if meid in [9422, 7695, 1175, 10352, 9309]:
            project = "proj_tb"
        else:
            project = "proj_dismod"

        logging_filepath = f"{dirs['model_logdir']}/{args.mvid}_run_all.log"
        setup_logger(
            logging_filepath,
            level=args.quiet - args.verbose + default_debug_level)

        add_arguments = inverse_parser(args)
        if args.workflow:
            driver_arguments = ["--workflow", args.workflow]
        else:
            driver_arguments = list()
        submit_driver(
            args.mvid, project, dirs, add_arguments, driver_arguments)
    except Exception:
        if args.pdb:
            # This invokes a live pdb debug session when an uncaught
            # exception makes it here.
            import pdb
            import traceback

            traceback.print_exc()
            pdb.post_mortem()
        else:
            raise
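The level arithmetic (args.quiet - args.verbose + default_debug_level) implies -q and -v are counting flags that shift a base verbosity. A hedged sketch of how setup_logger might map that integer onto stdlib levels; the real implementation is not shown and this mapping is an assumption:

import logging

def setup_logger(filepath, level=0):
    # Assumed mapping: 0 -> INFO, each -v step (more negative) is noisier,
    # each -q step (more positive) is quieter, clamped at the extremes.
    levels = [logging.DEBUG, logging.INFO, logging.WARNING, logging.ERROR]
    idx = min(max(level + 1, 0), len(levels) - 1)
    logging.basicConfig(filename=filepath, level=levels[idx])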
Example no. 4
def main():
    '''Set commit hash, upload model, try to write effects_plots pdfs,
    aggregate model version draws up location hierarchy
    '''
    args = parse_args()
    mvid = args.mvid
    default_debug_level = -1
    dirs = prepare_directories(mvid, create_directories=False)
    logging_filepath = f"{dirs['model_logdir']}/{mvid}_varnish.log"
    setup_logger(logging_filepath,
                 level=args.quiet - args.verbose + default_debug_level)

    log = logging.getLogger(__name__)
    log.info("Varnish started for mvid {}".format(mvid))
    setup_io_patches(args.no_upload)

    try:
        try:
            commit_hash = get_commit_hash(dir='%s/..' % drill.this_path)
        except subprocess.CalledProcessError:
            # in site-packages, not git repo
            commit_hash = __version__

        upload.set_commit_hash(mvid, commit_hash)
        upload.upload_model(mvid)

        outdir = "%s/%s/full" % (drill.settings['cascade_ode_out_dir'],
                                 str(mvid))
        joutdir = "%s/%s" % (drill.settings['diag_out_dir'], mvid)
        fit_df = fit_stats.write_fit_stats(mvid, outdir, joutdir)
        if fit_df is not None:
            try:
                upload.upload_fit_stat(mvid)
            except sqlalchemy.exc.IntegrityError:
                log.warning("fit stat already uploaded -- skipping")
        else:
            log.warning("No fit stats computed")

        # Write effect PDFs
        plotter = "{}/effect_plots.r".format(drill.this_path)
        plotter = os.path.realpath(plotter)

        demo = Demographics(mvid)
        try:
            subprocess.check_output(
                ["Rscript", plotter,
                 str(mvid), joutdir, drill.settings['cascade_ode_out_dir'],
                 str(max(demo.year_ids))],
                stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            log.exception("Error in effect plots")

        # Clean aggregations to ensure idempotency
        decomp_step = decomp_step_from_decomp_step_id(
            importer.get_model_version(mvid).decomp_step_id.unique()[0])
        clean_model_directory(outdir, demo.gbd_round_id, decomp_step)

        # Launch final aggregations
        log.info("Starting Save Results")
        aggregate_model(mvid, demo=demo, no_upload=args.no_upload)
    except Exception:
        log.exception("Error in varnish")
        raise
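The IntegrityError guard around upload_fit_stat is what makes re-running varnish safe: a duplicate-key violation is read as "already uploaded" rather than as a failure. A self-contained toy of the same pattern against an in-memory SQLite table (the fit_stat schema here is a stand-in, not the real one):

import logging
import sqlalchemy

def upload_once(engine, mvid):
    # Treat a duplicate-key IntegrityError as "already uploaded" and skip.
    try:
        with engine.begin() as conn:
            conn.execute(
                sqlalchemy.text("INSERT INTO fit_stat (mvid) VALUES (:mvid)"),
                {"mvid": mvid})
    except sqlalchemy.exc.IntegrityError:
        logging.warning("fit stat already uploaded -- skipping")

engine = sqlalchemy.create_engine("sqlite://")
with engine.begin() as conn:
    conn.execute(
        sqlalchemy.text("CREATE TABLE fit_stat (mvid INTEGER PRIMARY KEY)"))
upload_once(engine, 1234)
upload_once(engine, 1234)  # duplicate key: logged and skipped, not raised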
Example no. 5
def run_location(args):
    '''Run dismod for a single location. Meant to be called in parallel
    via multiprocessing.

    Args:
        args (tuple): A packed pair: a proxy to a shared dictionary and
            the ID of the location to run.

    Returns:
        Tuple of location_id and either a string error message or the
        integer 0, meaning no error.
    '''
    shared, loc_id = args
    gc.collect()
    sex_id = shared["sex_id"]
    year = shared["year"]
    full_timespan = shared["full_timespan"]
    args = shared["args"]
    cascade = shared["cascade"]
    cl_parent = shared["cl_parent"]

    # Each subprocess has its own imports, so patches need to be redone.
    setup_io_patches(args.no_upload)
    # Both branches run the same sequence; only error handling differs.
    def run_cascade_loc():
        # Locations flagged full_timespan additionally pass timespan=50.
        extra = {'timespan': 50} if full_timespan else {}
        cl = Cascade_loc(loc_id,
                         sex_id,
                         year,
                         cascade,
                         parent_loc=cl_parent,
                         feature_flags=args,
                         **extra)
        cl.initialize()
        cl.run_dismod()
        cl.summarize_posterior()
        cl.draw()
        cl.predict()
        return loc_id, 0

    if args.debug:
        # In debug mode let any exception propagate to the caller.
        return run_cascade_loc()
    else:
        try:
            return run_cascade_loc()
        except Exception as e:
            logging.exception("Failure running location {}".format(loc_id))
            return loc_id, str(e)
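run_location takes one packed tuple because pool-style map functions pass a single argument per work item. A runnable toy of the same convention, with a Manager dict standing in for the "proxy to a dictionary" the docstring mentions (all names here are illustrative):

import multiprocessing

def run_one(packed):
    shared, loc_id = packed   # unpack exactly as run_location does
    year = shared["year"]     # read from the shared dictionary proxy
    return loc_id, 0          # 0 means "no error", matching the contract

if __name__ == '__main__':
    with multiprocessing.Manager() as manager:
        shared = manager.dict(year=2000, sex_id=0.5)
        work = [(shared, loc) for loc in (101, 102, 103)]
        with multiprocessing.Pool(processes=2) as pool:
            print(pool.map(run_one, work))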
Example no. 6
def main():
    '''Read command line arguments to run dismod for all child location ids of
    given location ids.
    '''
    args = parse_args()
    mvid = args.mvid
    location_id = args.location_id
    sex = args.sex
    y = args.year_id
    cv_iter = args.cv_iter
    if args.debug not in {None, "debug"}:
        raise AttributeError(
            f"Debug flag should be off or 'debug' but is {args.debug}.")

    dirs = prepare_directories(mvid, create_directories=False)
    logging_filepath = (
        f"{dirs['model_logdir']}/"
        f"{mvid}_{location_id}_{sex}_{y}_{cv_iter}_child.log")
    setup_logger(logging_filepath, level=args.quiet - args.verbose)
    log = logging.getLogger(__name__)
    log.info(
        "Starting cascade mvid {} loc {} sex {} year {} cv_iter {}".format(
            mvid, location_id, sex, y, cv_iter))

    setup_io_patches(args.no_upload)

    # Encode sex as the +/-0.5 value dismod expects, not a GBD sex_id.
    sex_dict = {'male': 0.5, 'female': -0.5}
    sex_id = sex_dict[sex]

    log.info("Creating cascade")
    cascade = Cascade(mvid,
                      reimport=False,
                      cv_iter=cv_iter,
                      feature_flags=args)
    log.info("Done with cascade")

    year_split_lvl = cascade.model_version_meta.fix_year.values[0] - 1
    lt = cascade.loctree
    this_lvl = lt.get_nodelvl_by_id(location_id)
    log.info("Generating cascade loc")
    # Location 1 is the global root; its parent fit uses sex 0 and year 2000.
    if location_id == 1:
        cl_parent = Cascade_loc(location_id,
                                0,
                                2000,
                                cascade,
                                reimport=False,
                                feature_flags=args)
    else:
        cl_parent = Cascade_loc(location_id,
                                sex_id,
                                y,
                                cascade,
                                reimport=False,
                                feature_flags=args)
    cl_parent.initialize()
    log.info("Done generating cascade loc")

    full_timespan = this_lvl < (year_split_lvl - 1)

    # Run all child locations of the given parent.
    arglist = [child.id for child in lt.get_node_by_id(location_id).children]

    shared_to_children = dict(
        cascade=cascade,
        cl_parent=cl_parent,
        sex_id=sex_id,
        year=y,
        full_timespan=full_timespan,
        args=args,
    )

    if args.debug:
        log.info('..... RUNNING IN SINGLE PROCESS DEBUG MODE .....')
        try:
            res = [
                run_location((shared_to_children, work)) for work in arglist
            ]
        except Exception:
            res = list()
            if args.pdb:
                # This invokes a live pdb debug session when an uncaught
                # exception makes it here.
                import pdb
                import traceback

                traceback.print_exc()
                pdb.post_mortem()
            else:
                raise
    else:
        res = distribute(run_location, shared_to_children, arglist)
        log.info("Done running")

    try:
        # Consuming the results can surface a pool that died mid-run.
        errors = ['%s: %s' % (str(r[0]), r[1]) for r in res if r[1] != 0]
    except concurrent.futures.process.BrokenProcessPool:
        log.error(
            ("Process pool died abruptly. Assuming sigkill due to OOM killer."
             " Returning exit code 137 for jobmon resource retry"))
        sys.exit(137)

    if len(errors) == 0:
        log.info("No errors found")
    else:
        num_errors = len(errors)
        error_msg = "; ".join(errors)
        log.error("Found {} errors for mvid {} loc {} sex {} year {} cv_iter"
                  "{}: {}".format(num_errors, mvid, location_id, sex, y,
                                  cv_iter, error_msg))
        if check_error_msg_for_sigkill(error_msg):
            log.error(
                '"Signals.SIGKILL: 9" found in error_msg. Returning exit code '
                ' 137 for jobmon resource retry.')
            sys.exit(137)
        else:
            raise RuntimeError('Dismod kernel failures.')
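The BrokenProcessPool handler and exit code 137 are two views of the same event: when the kernel's OOM killer SIGKILLs a worker, the pool breaks, and the job exits 128 + 9 = 137 so jobmon retries with a larger memory allocation. A POSIX-only demonstration of that failure mode, with the worker killing itself to stand in for the OOM killer:

from concurrent.futures import ProcessPoolExecutor
from concurrent.futures.process import BrokenProcessPool
import os
import signal
import sys

def _worker(_):
    # Stand in for the kernel's OOM killer: SIGKILL this worker process.
    os.kill(os.getpid(), signal.SIGKILL)

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=1) as pool:
        try:
            list(pool.map(_worker, [None]))
        except BrokenProcessPool:
            # 137 == 128 + SIGKILL(9): jobmon reads this as an OOM retry.
            sys.exit(137)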