Example #1
def run_on_cluster():
    #set parameters
    year, us_indir, ps_indir, parent_dir, log_dir, map_dir = sys.argv[1:7]
    year = int(year)
    nrows = None  # number of rows of file to read; make it small if you're just testing things out

    #if NOT running the entire file (for testing), note this in name when saving
    if nrows is not None:
        nrow_str = '_TEST_%d_ROWS' % nrows
    else:
        nrow_str = ''

    #get date and time info
    date_regex = re.compile(r'\W')
    date_unformatted = str(datetime.now().replace(microsecond=0))
    date_str = date_regex.sub('_', date_unformatted)

    rlog.open('%s/parse_fwf_%d_%s.log' % (log_dir, year, date_str))

    print_and_log('Hello, world')
    print_and_log('Initializing')

    #run code
    cod_data, cod_data_raw, cod_missingness = parse_cod_mortality(
        year, us_indir, ps_indir, map_dir, nrows)

    #save files:

    # 1. Save parsed-but-not-cleaned files:
    print_and_log('saving parsed files')
    cod_data_raw.to_csv('%s/parsed/data_%s_parsed.csv' % (parent_dir, year))

    # 2. Save missingness
    print_and_log('saving missingness data')
    cod_missingness.to_csv('%s/cleaned/missingness/missingness_info_%s.csv' %
                           (parent_dir, year))

    # 3. Save cleaned data.
    # NOTE: for 1980-1981, some deaths are misassigned to nonexistent
    # counties; for 1988-1991, some deaths are assigned to 'missing' due to
    # censorship. We re-assign these deaths to real counties in the next
    # steps (in the prep_for_redistribution folder), but until then we must
    # save these pre-adjusted files somewhere else, hence the branching below.
    if year in list(range(1980, 1982)) + list(range(1988, 1992)):
        print_and_log('saving pre-adjusted data to special folder')
        cod_data.to_csv(
            '%s/cleaned/pre_adjust_ak_ga_ny/data_%s_pre_adjust.csv' %
            (parent_dir, year))
    else:
        print_and_log('saving cleaned data')
        cod_data.to_csv('%s/cleaned/data_%s_cleaned.csv' % (parent_dir, year))

    print_and_log('File is parsed, cleaned, and saved!')
Example #2
def run_local():

    if os.path.isdir('H:/'):
        j = 'J:'
        h = 'H:'
    elif os.path.isdir('/home/j/'):
        j = '/home/j'
        h = '/homes/abertozz'
    else:
        print_and_log('What am I supposed to do?')
        # fail fast: j and h would otherwise be undefined below
        raise OSError('could not find the J or H drive')

    cod_dict = {}
    cod_raw = {}
    cod_missingness = {}

    nrows = None  # number of rows of file to read; make it small for running local jobs
    map_dir = '%s/Project/us_counties/mortality/data_prep/counties/01_clean_microdata/state_map.csv' % j

    yearvals = [1992]

    for year in yearvals:
        if year in range(1968, 1989):
            us_indir = "%s/DATA/USA/VR/%d/USA_VITAL_STATISTICS_%d_MORTALITY.TXT" % (
                j, year, year)
            ps_indir = 'NONE'
        elif year in range(1989, 1994):
            fname = get_filepaths(year)
            us_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname)
            ps_indir = 'NONE'
        else:
            fname = get_filepaths(year)
            us_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname['US'])
            ps_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname['PS'])

        rlog.open(
            '%s/temp/amelia/counties/parse_death_files/debug_parse_%d.log' %
            (j, year))
        rlog.log('Hello, world')

        rlog.log('Initializing')

        cod_data, cod_data_raw, cod_missingness = parse_cod_mortality(
            year, us_indir, ps_indir, map_dir, nrows)

    # note: only the results for the last year in yearvals are returned
    return cod_data, cod_data_raw, cod_missingness
Example #3
log_dir, year, env_id, late_id, out_dir = sys.argv[1:6]

year = int(year)
env_id = int(env_id)
late_id = int(late_id)
cause = [env_id, late_id]

# get list of locations
locations = maternal_fns.get_locations()

# set up columns we want to subset
columns = maternal_fns.filter_cols()
index_cols = [col for col in columns if not col.startswith('draw_')]

# logging
# 'FILEPATH' is a redaction in the source; format arguments restored so
# the call runs
rlog.open('%s/FILEPATH_%s.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
codcorrect_df = draws(gbd_ids={'cause_ids': [env_id, late_id]},
                      source='codcorrect', year_ids=[year], sex_ids=[2],
                      measure_ids=[1])
codcorrect_df['measure_id'] = 1
# keep only maternal age groups (GBD age_group_ids 7-15, ages 10-54)
codcorrect_df = codcorrect_df[codcorrect_df.age_group_id.isin(range(7, 16))]

envelope_df = codcorrect_df[codcorrect_df.cause_id == env_id]
late_df = codcorrect_df[codcorrect_df.cause_id == late_id]
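
The snippet breaks off after splitting the envelope and late dataframes. As a continuation sketch (not the source's code), the late cause fractions could be computed by dividing late draws by envelope draws; the merge keys, suffixes, and the late_cfs name below are assumptions:

draw_cols = [col for col in columns if col.startswith('draw_')]
merge_cols = [col for col in index_cols if col != 'cause_id']

# align late and envelope rows on demographics, then divide draw by draw
merged = late_df.merge(envelope_df, on=merge_cols,
                       suffixes=('_late', '_env'))
for col in draw_cols:
    merged[col] = merged[col + '_late'] / merged[col + '_env']
late_cfs = merged[merge_cols + draw_cols]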
Example #4
log_dir, year, dalynator_dir, env_id, late_id, out_dir = sys.argv[1:7]

year = int(year)
env_id = int(env_id)
late_id = int(late_id)
cause = [env_id, late_id]

# get list of locations
locations = maternal_fns.get_locations()

# set up columns we want to subset
columns = maternal_fns.filter_cols()

# logging
rlog.open('%s/dalynator_late_%s.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
for geo in locations:
    fname = 'draws_%s_%s.h5' % (geo, year)

    # dalynator files are saved as loc/year, with age, sex and cause inside
    try:
        dalynator_df = pd.read_hdf(
            '%s/%s/%s' % (dalynator_dir, geo, fname), 'data',
            where=[("cause_id==%s & measure_id==1 "
                    "& metric_id==1 & sex_id==2") % late_id])
    except IOError:
        # the source snippet is truncated mid-call; the closing of the
        # where clause and this handler are reconstructions
        rlog.log('could not read dalynator file for %s' % geo)
        raise
Example #5
import json

import pandas as pd

# dbapis, maternal_fns and rlog are project-internal modules; their
# imports are not shown in the source

# make dataframes less annoying
pd.set_option('display.max_columns', 5)

# create database engine
enginer = dbapis.engine_factory()

# create directory for intermediate file outputs
current_date = maternal_fns.get_time()
cluster_dir = maternal_fns.check_dir(
    '/ihme/centralcomp/maternal_mortality/%s' % current_date)

# set log structure
log_dir = maternal_fns.check_dir('%s/logs' % cluster_dir)
rlog.open('%s/master' % log_dir)

# read in dependency map
dep_map = pd.read_csv("dependency_map.csv", header=0).dropna(axis='columns',
                                                             how='all')

# set all year vals
yearvals = range(1980, 2016)

##########################################################################
# 01: SCALE FRACTIONS
# Dismod outputs cause fractions (cfs) for every maternal subcause (except
# the maternal parent), but only for certain years. We first interpolate
# between years to get a full time series for each subcause over our period
# of interest. Next, we proportionately scale the cause fractions so they
# sum to one across subcauses. Timing scaling and
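
As a minimal sketch of the proportional scaling the comment above describes (the function and column names are illustrative, not the source's), assuming a dataframe with one row per subcause and GBD-style draw_* columns for a single location/year/age group:

def scale_cause_fractions(subcause_df, draw_cols):
    # per-draw totals across subcauses divide their matching draw
    # columns, so each draw sums to one after scaling
    scaled = subcause_df.copy()
    totals = scaled[draw_cols].sum()
    scaled[draw_cols] = scaled[draw_cols] / totals
    return scaled

# usage: draw_cols = [c for c in df.columns if c.startswith('draw_')]
#        scaled_df = scale_cause_fractions(df, draw_cols)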
Example #6
else:
    print('Where am I supposed to go?')

##############################################
# PREP WORK:
# set directories and other preliminary data
##############################################

print('starting job!')

log_dir, jobname, dismod_dir, cluster_dir, year = sys.argv[1:6]

year = int(year)

# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('Starting scale fractions step')

# get list of locations
locations = maternal_fns.get_locations()
geo_length = len(locations)

# set up database
enginer = dbapis.engine_factory()

# set up columns we want to subset
columns = maternal_fns.filter_cols()

# get dependency_map
dep_map = pd.read_csv(
    "dependency_map.csv", header=0).dropna(axis='columns', how='all')