Beispiel #1
0
def run_pipeline_upload(como_dir, component, location_id):
    """Upload the summaries of a single como component.

    Args:
        como_dir: handed to ``ComoVersion`` to resume the como version
        component (str): 'cause', 'sequela', 'impairment' or 'injuries';
            any other value uploads nothing
        location_id: forwarded unchanged to the component's upload function
    """
    como_version = ComoVersion(como_dir)
    como_version.load_cache()

    # guard clauses: at most one uploader runs per call
    if component == 'cause':
        upload_cause_summaries(como_version, location_id)
        return
    if component == 'sequela':
        upload_sequela_summaries(como_version, location_id)
        return
    if component == 'impairment':
        # impairments are uploaded through the rei summaries uploader
        upload_rei_summaries(como_version, location_id)
        return
    if component == "injuries":
        upload_inj_summaries(como_version, location_id)
Beispiel #2
0
def run_pipeline_aggregate_locations(como_dir, component, year_id, sex_id,
                                     measure_id, location_set_id):
    """Run the location aggregation of one component.

    Args:
        como_dir: handed to ``ComoVersion`` to resume the como version
        component (str): 'cause', 'sequela', 'impairment' or 'injuries';
            any other value aggregates nothing
        year_id, sex_id, measure_id, location_set_id: forwarded unchanged
            to the component's aggregation function
    """
    # resume the como version with stored parameters
    como_version = ComoVersion(como_dir)
    como_version.load_cache()

    # every aggregator takes the same demographic arguments
    agg_args = (como_version, year_id, sex_id, measure_id, location_set_id)
    if component == "cause":
        agg_causes(*agg_args)
    elif component == "sequela":
        agg_sequelae(*agg_args)
    elif component == "impairment":
        agg_impairment(*agg_args)
    elif component == "injuries":
        agg_injuries(*agg_args)
Beispiel #3
0
def run_pipeline_nonfatal(como_dir,
                          location_id=[],
                          year_id=[],
                          sex_id=[],
                          age_group_id=[],
                          measure_id=[],
                          n_processes=23,
                          n_simulants=40000,
                          *args,
                          **kwargs):
    """Run the nonfatal calculation for one set of demographics.

    The como version is resumed from ``como_dir``, a ``ComputeNonfatal``
    object is built for the requested demographics, and then data is
    imported, results are computed and finally written to disk.

    Args:
        como_dir (str): handed to ``ComoVersion`` to resume the como version
        location_id (list, optional): forwarded to ``ComputeNonfatal``
        year_id (list, optional): forwarded to ``ComputeNonfatal``
        sex_id (list, optional): forwarded to ``ComputeNonfatal``
        age_group_id (list, optional): forwarded to ``ComputeNonfatal``
        measure_id (list, optional): forwarded to ``ComputeNonfatal``
        n_processes (int, optional): used for both the data import and the
            computation
        n_simulants (int, optional): forwarded to ``compute_results``

        *args and **kwargs are passed into ``compute_results``

    NOTE(review): the list defaults are shared between calls; this function
    never mutates them, but confirm nothing downstream does either.
    """
    # resume the como version with stored parameters
    como_version = ComoVersion(como_dir)
    como_version.load_cache()

    # the demographic set this computation is restricted to
    demographics = {
        "location_id": location_id,
        "year_id": year_id,
        "sex_id": sex_id,
        "age_group_id": age_group_id,
        "measure_id": measure_id,
    }
    nonfatal = ComputeNonfatal(como_version, **demographics)

    # import -> compute -> write
    nonfatal.import_data(n_processes=n_processes)
    nonfatal.compute_results(*args,
                             n_simulants=n_simulants,
                             n_processes=n_processes,
                             **kwargs)
    nonfatal.write_results()
Beispiel #4
0
def setup_env(como_version_id):
    """Initialise the module-level state shared by the simulation workers.

    Sets the globals ``cv`` (the como version), ``simdir`` and ``pooldir``
    (the 'simulants' and 'locsims' directories under ``cv.root_dir``) and
    ``sg`` (a ``SuperGopher`` reading the 'draws' table of the per
    location/year/sex simulant files in ``simdir``).

    Args:
        como_version_id: handed to ``ComoVersion``

    Raises:
        OSError: if ``pooldir`` does not exist and cannot be created
    """
    global cv, simdir, pooldir, sg
    cv = ComoVersion(como_version_id)
    simdir = os.path.join(cv.root_dir, 'simulants')
    pooldir = os.path.join(cv.root_dir, 'locsims')
    try:
        os.makedirs(pooldir)
    except OSError:
        # An already existing directory is the expected case; any other
        # failure (permissions, bad path, ...) must not be hidden, because
        # later writes into pooldir would fail far from the real cause.
        if not os.path.isdir(pooldir):
            raise
    sg = SuperGopher(
        {
            'file_pattern': 'sims_{location_id}_{year_id}_{sex_id}.h5',
            'h5_tablename': 'draws'
        }, simdir)
Beispiel #5
0
def run_como(
        como_dir=None,
        root_dir="FILEPATH",
        gbd_round_id=5,
        location_set_id=35,
        year_id=list(range(1990, 2018)),
        measure_id=[3, 5, 6],
        n_draws=1000,
        n_simulants=20000,
        components=["cause", "sequela", "injuries", "impairment"],
        change_years=[(1990, 2007), (2007, 2017), (1990, 2017)],
        agg_loc_sets=[35, 83],
        project="proj_como"):
    """Run the como workflow and upload its results.

    When ``como_dir`` is given, that como version is resumed; otherwise a
    new one is created from the remaining arguments. After a successful
    workflow run, the results of every location found in any of the
    location sets (``location_set_id`` plus ``agg_loc_sets``) are uploaded.

    Raises:
        RuntimeError: if the workflow reports failure
    """
    primary_set = {location_set_id}
    # sets that are aggregated in addition to the primary one
    special_sets = set(agg_loc_sets) - primary_set
    all_sets = set(agg_loc_sets) | primary_set

    if como_dir is None:
        cv = ComoVersion.new(
            root_dir, gbd_round_id, location_set_id, year_id, measure_id,
            n_draws, components, change_years, special_sets)
    else:
        cv = ComoVersion(como_dir)
        cv.load_cache()

    workflow = ComoWorkFlow(cv)
    workflow.add_tasks_to_dag(n_simulants=n_simulants, agg_loc_sets=all_sets)
    if not workflow.run_workflow(project=project):
        raise RuntimeError("como unsuccessful")

    # collect the node ids of every location set, then de-duplicate
    all_locs = []
    for set_id in all_sets:
        tree = loctree(location_set_id=set_id,
                       gbd_round_id=cv.gbd_round_id)
        all_locs.extend(tree.node_ids)
    run_upload(cv, list(set(all_locs)))
Beispiel #6
0
                           "sex_id": dimensions.index_dim.get_level("sex_id")
                       },
                       n_processes=self.chunksize[component])


if __name__ == "__main__":
    # command line interface of a single location aggregation task
    parser = argparse.ArgumentParser(
        description="Compute nonfatal aggregate for a year-sex-measure")
    parser.add_argument(
        "--como_dir", type=str, help="directory of como run")
    parser.add_argument(
        "--component", type=str, help="which component to aggregate")
    parser.add_argument(
        "--year_id", type=int, help="year_id to aggregate")
    parser.add_argument(
        "--sex_id", type=int, help="sex_id to aggregate")
    parser.add_argument(
        "--measure_id", type=int, help="measure_id to aggregate")
    parser.add_argument(
        "--location_set_version_id",
        type=int,
        help="location_set_version_id to aggregate")
    parser.add_argument(
        "--redis_host",
        type=str,
        help="redis_host to manage concurrent I/O")
    args = parser.parse_args()

    # resume the como version, then aggregate the requested component
    # over the requested location set version
    cv = ComoVersion(args.como_dir)
    cv.load_cache()
    task = LocationAggTask(
        cv, args.measure_id, args.year_id, args.sex_id, args.redis_host)
    task.run_task(args.location_set_version_id, args.component)
def _check_aggregates(df, label, expected_rows=50922):
    """Assert that an aggregated draws frame is complete and well-formed.

    Args:
        df: aggregated data frame to validate
        label (str): measure name used in the null-value message
        expected_rows (int): required number of rows in ``df``

    Raises:
        AssertionError: on a wrong row count, duplicated id columns or
            null values
    """
    id_cols = [
        'location_id', 'year_id', 'age_group_id', 'sex_id', 'measure_id',
        'cause_id', 'rei_id'
    ]
    assert len(df.index) == expected_rows, (
        "The number of rows in the injized DF is not correct.")
    # `not` rather than unary minus: negating a NumPy boolean with `-`
    # is rejected by current NumPy releases
    assert not df.duplicated(subset=id_cols).any(), (
        "The id columns do not uniquely identify the observations!")
    assert not df.isnull().any().any(), (
        "You have null values in the {} DF!".format(label))


def main(root_j_dir, root_tmp_dir, date, code_dir, in_dir, out_dir, ndraws,
         demographics, task_id):
    """Compute and write incidence and prevalence draws for one task.

    The row of ``demographics`` whose ``task_id`` matches selects the
    location, year and sex to process. Incidence (written with
    measure_id 6) and prevalence (written with measure_id 5) are computed,
    aggregated, validated and written to ``out_dir``; an empty check file
    is created once both are done.

    Args:
        root_j_dir, root_tmp_dir, date, in_dir: not used by this function
        code_dir (str): directory holding the E-N combinations csv
        out_dir: forwarded to ``write_result_draws``
        ndraws: forwarded to ``get_incidence`` / ``get_prevalence``
        demographics: data frame with ``task_id``, ``location_id``,
            ``year_id`` and ``sex_id`` columns
        task_id: value of ``task_id`` identifying the row to process

    Raises:
        AssertionError: if an aggregated frame fails validation
    """
    # subset based on task id to the demographic arguments
    dems = demographics.loc[demographics["task_id"] == task_id]
    location_id = dems["location_id"].iloc[0].item()
    year_id = dems["year_id"].iloc[0].item()
    sex_id = dems["sex_id"].iloc[0].item()

    # import hierarchies from Como
    from como.version import ComoVersion
    cv = ComoVersion("FILEPATH")
    cv.load_cache()

    # get dimensions and replace with what we are parallelizing
    # in this child script
    print("Copying dimensions from cv")
    dim = deepcopy(cv.dimensions)
    dim.index_dim.replace_level("location_id", location_id)
    dim.index_dim.replace_level("year_id", year_id)
    dim.index_dim.replace_level("sex_id", sex_id)

    print("Adding cause to dimensions")
    # add cause to dimensions
    dim.index_dim.add_level("cause_id",
                            cv.cause_restrictions.cause_id.unique().tolist())
    dim.index_dim.add_level("rei_id",
                            cv.ncode_hierarchy.rei_id.unique().tolist())

    # set the years so that we always have 2005 to calibrate
    years = list(
        set(
            cap_val(dim.index_dim.levels.year_id,
                    [1990, 1995, 2000, 2005, 2010, 2016]) + [2005]))
    print("Years")
    print(years)

    # get all E-N combinations to use to make square data
    codes = pd.read_csv(os.path.join(code_dir, "FILEPATH.csv"))

    # Get Incidence df
    df_inc = get_incidence(dim, cv, years, codes, ndraws=ndraws)
    df_inc_agg = compute_aggregates(df_inc, dim, cv)
    _check_aggregates(df_inc_agg, "incidence")
    print("Writing results for incidence")
    write_result_draws(df_inc_agg, measure_id=6, dim=dim, out_dir=out_dir)

    # Get Prevalence df
    df_prev = get_prevalence(dim, cv, years, codes, ndraws=ndraws)
    df_prev_agg = compute_aggregates(df_prev, dim, cv)
    # validate the prevalence frame itself (the row count used to be
    # re-checked on the incidence frame)
    _check_aggregates(df_prev_agg, "prevalence")
    print("Writing results for prevalence")
    write_result_draws(df_prev_agg, measure_id=5, dim=dim, out_dir=out_dir)

    # save checkfile when all done
    checkpath = os.path.join(
        "FILEPATH", "finished_{}_{}_{}.txt".format(location_id, year_id,
                                                   sex_id))
    with open(checkpath, 'wb'):
        pass
Beispiel #8
0
def run_pipeline_summarize(como_dir, component, location_id):
    """Summarize one component for one location.

    Args:
        como_dir: handed to ``ComoVersion`` to resume the como version
        component: forwarded to ``summ``
        location_id: forwarded to ``summ``
    """
    como_version = ComoVersion(como_dir)
    como_version.load_cache()
    summ(como_version, location_id, component)
Beispiel #9
0
def _queue_simulation_jobs(jobq, cv, n_simulants):
    """Queue one ``compute_nonfatal`` job per location/sex slice."""
    parallelism = ["location_id", "sex_id"]
    for slices in cv.dimensions.index_slices(parallelism):
        jobname = "como_e_sim_{location_id}_{sex_id}".format(
            location_id=slices[0], sex_id=slices[1])
        job = jobq.create_job(
            jobname=jobname,
            runfile=true_path(executable="compute_nonfatal"),
            parameters=[
                "--como_dir", cv.como_dir,
                "--location_id", str(slices[0]),
                "--sex_id", str(slices[1]),
                "--n_processes", "23",
                "--n_simulants", str(n_simulants)
            ])
        jobq.queue_job(job,
                       slots=50,
                       memory=400,
                       project="proj_como",
                       process_timeout=(60 * 180))


def _queue_aggregation_jobs(jobq, cv):
    """Queue one ``aggregate_nonfatal`` job per year/sex/measure slice,
    component and location set."""
    parallelism = ["year_id", "sex_id", "measure_id"]
    for slices in cv.dimensions.index_slices(parallelism):
        for component in cv.components:
            # sequela is only aggregated over location set 35
            loc_sets = [35] if component == "sequela" else [35, 40]
            for location_set_id in loc_sets:
                jobname = ("como_e_agg_{component}_{year_id}_{sex_id}"
                           "_{measure_id}_{location_set_id}").format(
                               component=component,
                               year_id=slices[0],
                               sex_id=slices[1],
                               measure_id=slices[2],
                               location_set_id=location_set_id)
                # NOTE(review): confirm the aggregate_nonfatal executable
                # accepts a --location_set_id option
                job = jobq.create_job(
                    jobname=jobname,
                    runfile=true_path(executable="aggregate_nonfatal"),
                    parameters=[
                        "--como_dir", cv.como_dir,
                        "--component", component,
                        "--year_id", str(slices[0]),
                        "--sex_id", str(slices[1]),
                        "--measure_id", str(slices[2]),
                        "--location_set_id", str(location_set_id)
                    ])
                jobq.queue_job(job,
                               slots=25,
                               memory=200,
                               project="proj_como",
                               process_timeout=(60 * 600))


def _queue_summary_jobs(jobq, cv, locs, sdi_locs):
    """Queue one ``summarize_nonfatal`` job per component and location."""
    for component in cv.components:
        # sequela is not summarized for the location set 40 roots
        if component != "sequela":
            summ_locs = locs + sdi_locs
        else:
            summ_locs = locs[:]
        for location_id in summ_locs:
            jobname = "como_e_summ_{component}_{location_id}".format(
                component=component, location_id=location_id)
            job = jobq.create_job(
                jobname=jobname,
                runfile=true_path(executable="summarize_nonfatal"),
                parameters=[
                    "--como_dir", cv.como_dir,
                    "--component", component,
                    "--location_id", str(location_id)
                ])
            jobq.queue_job(job,
                           slots=48,
                           memory=96,
                           project="proj_como",
                           process_timeout=(60 * 240))


def _queue_upload_jobs(jobq, cv, all_locs):
    """Queue one ``upload_nonfatal`` job per component for all locations."""
    # all location ids travel as one space separated parameter
    location_arg = " ".join([str(l) for l in all_locs])
    for component in cv.components:
        jobname = "como_e_upload_{component}".format(component=component)
        job = jobq.create_job(
            jobname=jobname,
            runfile=true_path(executable="upload_nonfatal"),
            parameters=[
                "--como_dir", cv.como_dir,
                "--component", component,
                "--location_id", location_arg
            ])
        jobq.queue_job(job,
                       slots=20,
                       memory=40,
                       project="proj_como",
                       process_timeout=(60 * 720))


def run_pipeline_como(
        root_dir,
        gbd_round_id=4,
        location_id=[],
        year_id=[],
        sex_id=[],
        age_group_id=[],
        measure_id=[],
        n_draws=1000,
        n_simulants=20000,
        components=["sequela", "cause", "impairment", "injuries"]):
    """Create a new como version and run the full pipeline on the cluster.

    Four stages are queued one after another, each waiting for the previous
    one to finish: nonfatal simulation (by location/sex), aggregation (by
    year/sex/measure, component and location set), summaries (by component
    and location) and upload (by component).

    Args:
        root_dir, gbd_round_id, location_id, year_id, sex_id, age_group_id,
        measure_id, n_draws, components: forwarded to ``ComoVersion.new``
        n_simulants (int): passed to every simulation job as
            ``--n_simulants``

    The job monitor is always asked for its report and shut down, whether
    the pipeline succeeds or fails.
    """
    cv = ComoVersion.new(root_dir, gbd_round_id, location_id, year_id, sex_id,
                         age_group_id, measure_id, n_draws, components)

    # Create the monitor before entering the try block: if construction
    # fails there is nothing to shut down, and the original error must
    # propagate instead of being masked by an unbound name in `finally`.
    cjm = CentralJobMonitor(cv.como_dir, persistent=False)
    try:
        # presumably gives the monitor time to come up -- TODO confirm
        time.sleep(5)

        executor_params = {"request_timeout": 10000}
        jobq = JobQueue(cv.como_dir,
                        scheduler=RetryScheduler,
                        executor=SGEExecutor,
                        executor_params=executor_params)

        # run nonfatal pipeline by location/sex
        _queue_simulation_jobs(jobq, cv, n_simulants)
        jobq.block_till_done(stop_scheduler_when_done=False)

        # run aggregation by year/sex/measure
        _queue_aggregation_jobs(jobq, cv)
        jobq.block_till_done(stop_scheduler_when_done=False)

        # run summaries by component/location
        lt = dbtrees.loctree(None, 35)
        sdi_lts = dbtrees.loctree(None, 40, return_many=True)
        locs = [l.id for l in lt.nodes]
        sdi_locs = [l.root.id for l in sdi_lts]
        _queue_summary_jobs(jobq, cv, locs, sdi_locs)
        jobq.block_till_done(stop_scheduler_when_done=False)

        # upload by component; the last wait also stops the scheduler
        _queue_upload_jobs(jobq, cv, locs + sdi_locs)
        jobq.block_till_done()
    finally:
        cjm.generate_report()
        cjm.stop_responder()
        cjm.stop_publisher()