def run_on_cluster():
    """Parse, clean, and save one year of US cause-of-death mortality data.

    Expects cluster-style positional arguments:
        sys.argv[1:7] = year, us_indir, ps_indir, parent_dir, log_dir, map_dir

    Side effects: opens an rlog log file and writes three CSVs under
    ``parent_dir`` (parsed data, missingness info, and cleaned data).
    """
    # set parameters
    year, us_indir, ps_indir, parent_dir, log_dir, map_dir = sys.argv[1:7]
    year = int(year)

    nrows = None  # number of rows of file to read; make it small if you're just testing things out

    # if NOT running the entire file (for testing), note this in name when saving
    # (use `is not None`, not `!= None`, per PEP 8)
    if nrows is not None:
        nrow_str = '_TEST_%d_ROWS' % nrows
    else:
        nrow_str = ''

    # get date and time info for the log filename
    # (raw string so \W is a regex character class, not a string escape)
    date_regex = re.compile(r'\W')
    date_unformatted = str(datetime.now().replace(microsecond=0))
    date_str = date_regex.sub('_', date_unformatted)

    rlog.open('%s/parse_fwf_%d_%s.log' % (log_dir, year, date_str))
    print_and_log('Hello, world')
    print_and_log('Initializing')

    # run code
    cod_data, cod_data_raw, cod_missingness = parse_cod_mortality(
        year, us_indir, ps_indir, map_dir, nrows)

    # save files:
    # 1. Save parsed-but-not-cleaned files:
    print_and_log('saving parsed files')
    cod_data_raw.to_csv('%s/parsed/data_%s_parsed.csv' % (parent_dir, year))

    # 2. Save missingness
    print_and_log('saving missingness data')
    cod_missingness.to_csv(
        '%s/cleaned/missingness/missingness_info_%s.csv' % (parent_dir, year))

    # 3. Save cleaned data.
    # NOTE: if the year is 1980/1981 or 1988-1991, there will be deaths
    # misassigned to nonexistent counties or deaths assigned to 'missing' due
    # to censorship, respectively. We re-assign these deaths to real counties
    # in the next steps (in the prep_for_redistribution folder), but until
    # then we must save these pre-adjusted files somewhere else, hence the
    # logical tree below.
    # BUG FIX: the original `range(1980, 1982) + range(1988, 1992)` only works
    # in Python 2 (range objects cannot be concatenated in Python 3); an
    # explicit tuple is equivalent and version-independent.
    if year in (1980, 1981, 1988, 1989, 1990, 1991):
        print_and_log('saving pre-adjusted data to special folder')
        cod_data.to_csv(
            '%s/cleaned/pre_adjust_ak_ga_ny/data_%s_pre_adjust.csv' %
            (parent_dir, year))
    else:
        print_and_log('saving cleaned data')
        cod_data.to_csv('%s/cleaned/data_%s_cleaned.csv' % (parent_dir, year))

    print_and_log('File is parsed, cleaned, and saved!')
def run_local():
    """Run the mortality parse interactively for a hard-coded list of years.

    Resolves the J:/H: drive roots depending on whether we are on a Windows
    workstation or the Linux cluster, builds the input file paths for each
    year, and calls parse_cod_mortality.

    Returns:
        tuple: (cod_data, cod_data_raw, cod_missingness) from the LAST year
        processed in ``yearvals``.
    """
    # resolve drive roots; `isdir` already returns a bool, so no `== True`
    if os.path.isdir('H:/'):
        j = 'J:'
        h = 'H:'
    elif os.path.isdir('/home/j/'):
        j = '/home/j'
        h = '/homes/abertozz'
    else:
        # NOTE(review): if neither root exists, `j` is never bound and the
        # map_dir assignment below will raise NameError — confirm intended.
        print_and_log('What am I supposed to do?')

    cod_dict = {}
    cod_raw = {}
    cod_missingness = {}

    nrows = None  # number of rows of file to read; make it small for running local jobs

    map_dir = '%s/Project/us_counties/mortality/data_prep/counties/01_clean_microdata/state_map.csv' % j

    yearvals = [1992]
    for year in yearvals:
        # input-file layout changed over time, hence the three regimes below
        if year in range(1968, 1989):
            us_indir = "%s/DATA/USA/VR/%d/USA_VITAL_STATISTICS_%d_MORTALITY.TXT" % (
                j, year, year)
            ps_indir = 'NONE'
        elif year in range(1989, 1994):
            fname = get_filepaths(year)
            us_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname)
            ps_indir = 'NONE'
        else:
            # later years have separate US and Puerto Rico ('PS') files
            fname = get_filepaths(year)
            us_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname['US'])
            ps_indir = '%s/LIMITED_USE/PROJECT_FOLDERS/USA/NVSS_MORTALITY/%d/%s' % (
                j, year, fname['PS'])

        rlog.open(
            '%s/temp/amelia/counties/parse_death_files/debug_parse_%d.log' %
            (j, year))
        rlog.log('Hello, world')
        rlog.log('Initializing')

        cod_data, cod_data_raw, cod_missingness = parse_cod_mortality(
            year, us_indir, ps_indir, map_dir, nrows)

    return cod_data, cod_data_raw, cod_missingness
# ---------------------------------------------------------------------------
# Script setup: pull codcorrect draws for the maternal envelope cause and the
# "late" maternal cause for one year, to compute late cause fractions.
# Expects sys.argv[1:6] = log_dir, year, env_id, late_id, out_dir.
# ---------------------------------------------------------------------------
log_dir, year, env_id, late_id, out_dir = sys.argv[1:6]
year = int(year)
env_id = int(env_id)
late_id = int(late_id)
cause = [env_id, late_id]

# get list of locations
locations = maternal_fns.get_locations()

# set up columns we want to subset
columns = maternal_fns.filter_cols()
index_cols = [col for col in columns if not col.startswith('draw_')]

# logging
# BUG FIX: the original 'FILEPATH.log' % (log_dir, year) is broken — the
# format string has no %s placeholders, so the % operation raises
# TypeError ("not all arguments converted during string formatting").
# Use a real path template, matching the sibling dalynator script's
# '%s/dalynator_late_%s.log' convention.
rlog.open('%s/get_late_%s.log' % (log_dir, year))
rlog.log('')
rlog.log('Starting to get late cause fractions')

##############################################
# GET LATE CAUSE FRACTIONS:
##############################################
codcorrect_df = draws(gbd_ids={'cause_ids': [env_id, late_id]},
                      source='codcorrect', year_ids=[year], sex_ids=[2],
                      measure_ids=[1])
codcorrect_df['measure_id'] = 1
# keep age_group_ids 7-15 only (presumably the maternal reproductive-age
# groups — TODO confirm against GBD age-group metadata)
codcorrect_df = codcorrect_df[codcorrect_df.age_group_id.isin(range(7, 16))]

# split the pull into the envelope cause and the late cause
envelope_df = codcorrect_df[codcorrect_df.cause_id == env_id]
late_df = codcorrect_df[codcorrect_df.cause_id == late_id]
log_dir, year, dalynator_dir, env_id, late_id, out_dir = sys.argv[1:7] year = int(year) env_id = int(env_id) late_id = int(late_id) cause = [env_id, late_id] # get list of locations locations = maternal_fns.get_locations() # set up columns we want to subset columns = maternal_fns.filter_cols() # logging rlog.open('%s/dalynator_late_%s.log' % (log_dir, year)) rlog.log('') rlog.log('Starting to get late cause fractions') ############################################## # GET LATE CAUSE FRACTIONS: ############################################## for geo in locations: fname = 'draws_%s_%s.h5' % (geo, year) # dalynator files are saved as loc/year, with age, sex and cause inside try: dalynator_df = pd.read_hdf('%s/%s/%s' % (dalynator_dir, geo, fname), 'data', where=[("'cause_id'==%s & 'measure_id'==1" "& 'metric_id'==1 & 'sex_id'==2"
import json # make dataframes less annoying pd.set_option('display.max_columns', 5) # create enginer enginer = dbapis.engine_factory() # create directory for intermediate file outputs current_date = maternal_fns.get_time() cluster_dir = maternal_fns.check_dir( '/ihme/centralcomp/maternal_mortality/%s' % current_date) # set log structure log_dir = maternal_fns.check_dir('%s/logs' % cluster_dir) rlog.open('%s/master' % log_dir) # read in dependency map dep_map = pd.read_csv("dependency_map.csv", header=0).dropna(axis='columns', how='all') # set all year vals yearvals = range(1980, 2016) ########################################################################## # 01: SCALE FRACTIONS # Dismod outputs cfs for every maternal subcause (except the maternal parent) # but only for certain years. We first interpolate between years to get a # full time series for our period of interest. # We do this for subcauses. Next, we proportionately scale the cause # fractions so they sum to one across subcauses. Timing scaling and
# NOTE(review): this 'else' continues an if/elif chain that begins earlier in
# the file (outside this chunk) — presumably the same H:/ vs /home/j/ drive
# detection used by the sibling scripts; confirm against the full file.
else:
    print 'Where am I supposed to go?'

##############################################
# PREP WORK:
# set directories and other preliminary data
##############################################
print 'starting job!'

# expects sys.argv[1:6] = log_dir, jobname, dismod_dir, cluster_dir, year
log_dir, jobname, dismod_dir, cluster_dir, year = sys.argv[1:6]
year = int(year)

# logging
rlog.open('%s/%s.log' % (log_dir, jobname))
rlog.log('Starting scale fractions step')

# get list of locations
locations = maternal_fns.get_locations()
geo_length = len(locations)

# set up database
enginer = dbapis.engine_factory()

# set up columns we want to subset
columns = maternal_fns.filter_cols()

# get dependency_map (drops fully-empty columns from the CSV)
dep_map = pd.read_csv(
    "dependency_map.csv", header=0).dropna(axis='columns', how='all')