Ejemplo n.º 1
0
def adjust_data(bundle, cause_name):
    """Reset a bundle's epi data and re-run the cause-specific fixes.

    Steps:
      1. Download the bundle's current data and upload a bundle_id/seq-only
         sheet, which deletes every existing row for this bundle.
      2. Re-upload the latest recovery file found in the download folder.
      3. Run the hospital, marketscan and "other" adjustment routines,
         refreshing the data between each step.

    Parameters
    ----------
    bundle : int or str
        Bundle ID; coerced to int before use.
    cause_name : str
        Cause name passed through to the adjustment helpers.
    """
    import glob  # local import: only needed for the recovery-file lookup

    # Delete all data to start over if needed, comment out if not.
    bundle = int(bundle)
    # If export=True, save an excel file for the data retrieved in the
    # bundle's download folder.
    df = run.get_epi_data(bundle, export=True)
    # Uploading a sheet containing only bundle_id/seq deletes those rows.
    df = df[['bundle_id', 'seq']]
    destination_file = "{FILEPATH}"
    df.to_excel(destination_file, index=False, sheet_name="extraction")
    report = run.upload_epi_data(bundle, destination_file)
    assert (report['request_status'].item() == 'Successful')

    # Grab the latest download. BUG FIX: `fname` was referenced below but its
    # computation had been commented out, causing a NameError. Request files
    # are named "request_<id>.xlsx"; the highest numeric id is the newest.
    download_path = "{FILEPATH}"
    all_files = glob.glob(os.path.join(download_path, "*.xlsx"))
    request_ids = []
    for request in all_files:
        underscore_index = request.rfind('_')
        file_ext_index = request.rfind('.')
        # Convert strings to int or the sort/max would be lexical and wrong.
        request_ids.append(int(request[underscore_index + 1:file_ext_index]))
    fname = "request_{}.xlsx".format(max(request_ids))
    print(fname)

    # Grab the recovery file, blank out response_rate, and reupload so we
    # can start from the beginning.
    re_up = pd.read_excel(os.path.join(download_path, fname),
                          header=0,
                          sheet="extraction")
    re_up['response_rate'] = None

    destination_file = "{FILEPATH}"
    print(destination_file)
    re_up.to_excel(destination_file, index=False, sheet_name="extraction")
    report = run.upload_epi_data(bundle, destination_file)
    assert (report['request_status'].item() == 'Successful')
    report_file = "{FILEPATH}"
    report.to_csv(report_file, encoding='utf-8')

    # Run each adjustment on a freshly downloaded copy of the bundle data.
    df = run.get_epi_data(bundle)
    hospital_data(df, bundle, cause_name)
    df = run.get_epi_data(bundle)
    # Fail loudly if any rows are duplicated outside the first two columns.
    assert (df.duplicated(subset=df.iloc[:, 2:], keep=False).any()) == False
    marketscan_data(df, bundle, cause_name)
    df = run.get_epi_data(bundle)
    other_data(df, bundle, cause_name)
Ejemplo n.º 2
0
def other_data(df, bundle, cause_name):
    """Outlier China (national + subnational) "mtwith" rows and upload them.

    Parameters
    ----------
    df : pandas.DataFrame
        Current epi data for the bundle.
    bundle : int
        Bundle ID to upload the outliered rows to.
    cause_name : str
        Unused here; kept so the signature matches the other adjustment
        helpers (hospital_data, marketscan_data).
    """
    print("Other data!")

    metadata_set = 35
    round_id = 4

    # BUG FIX: round_id was defined but gbd_round_id was hard-coded to the
    # same literal; use the variable so the two cannot drift apart.
    meta = get_location_metadata(location_set_id=metadata_set,
                                 gbd_round_id=round_id)
    meta = meta[['location_id', 'parent_id']]
    china_df = df.merge(meta, how='left')
    china_loc_id = 6
    # Filter to China and its subnationals (direct children of location 6).
    china_df = china_df[(china_df.parent_id == china_loc_id) |
                        (china_df.location_id == china_loc_id)]
    # Keep only rows with the "mtwith" measure.
    china_df = china_df.loc[(china_df.measure == "mtwith")]
    if not china_df.empty:
        print("china_df not empty!")
        china_df.loc[:, 'is_outlier'] = 1
        # parent_id only came from the merge; drop it before uploading.
        china_df.drop('parent_id', axis=1, inplace=True)

        destination_file = "{FILEPATH}"
        print(destination_file)
        china_df.to_excel(destination_file,
                          index=False,
                          sheet_name="extraction")
        report = run.upload_epi_data(bundle, destination_file)
        # Assert nothing in the report is wrong.
        assert (report['request_status'].item() == 'Successful')
Ejemplo n.º 3
0
 def upload(self, modelable_entity_id, destination_file, error_path):
     """Validate the input sheet, upload it, and return the upload report.

     Raises AssertionError if validation does not pass or the upload
     request does not come back 'Successful'.
     """
     target_bundle = me_to_bundle[modelable_entity_id]
     validation = run.validate_input_sheet(target_bundle, destination_file,
                                           error_path)
     assert validation['status'].item() == 'passed'
     upload_report = run.upload_epi_data(target_bundle, destination_file)
     assert upload_report['request_status'].item() == 'Successful'
     return upload_report
Ejemplo n.º 4
0
def marketscan_data(df, bundle, cause_name):
    """Delete existing marketscan rows and re-upload them with fixes.

    Drops year_start == 2000 rows, outliers Hawaii (and everything for
    bundle 610), applies cause-specific tweaks, then uploads the sheet.

    Parameters
    ----------
    df : pandas.DataFrame
        Current epi data for the bundle.
    bundle : int
        Bundle ID being adjusted.
    cause_name : str
        Unused here; kept so the signature matches the other adjustment
        helpers.
    """
    # Delete marketscan data and reupload with new specifications.
    # All marketscan rows are flagged via covariate columns whose names
    # contain "marketscan"; collect their bundle_id/seq pairs to delete.
    searchfor = 'marketscan'
    cols = [c for c in df.columns if (searchfor in c)]
    sub_list = []
    for c in cols:
        subset = df.loc[df['{}'.format(c)] == 1, ['bundle_id', 'seq']]
        sub_list.append(subset)
    mrkt_delete = pd.concat(sub_list)

    if not mrkt_delete.empty:
        destination_file = "{FILEPATH}"
        mrkt_delete.to_excel(destination_file,
                             index=False,
                             sheet_name="extraction")
        report = run.upload_epi_data(bundle, destination_file)
        # Assert nothing in the report is wrong.
        assert (report['request_status'].item() == 'Successful')

    mrkt_infile = "{FILEPATH}"
    # BUG FIX: "ALL_{b}_v3_{DATE}.xlsx".format(b=bundle) raised
    # KeyError('DATE') because no DATE kwarg was supplied; pass the date
    # the same way hospital_data's deletion file does.
    fname = "ALL_{b}_v3_{d}.xlsx".format(b=bundle, d=date)
    market = pd.read_excel(os.path.join(mrkt_infile, fname),
                           header=0,
                           sheet="extraction")
    # Drop everything with year_start 2000.
    market = market[market.year_start != 2000]
    # Bundle 610's marketscan data is outliered wholesale.
    if bundle == 610:
        market.loc[:, 'is_outlier'] = 1
    # Outlier Hawaii.
    market.loc[market.location_name == "Hawaii", 'is_outlier'] = 1

    market = cause_specifics(market, bundle)
    mrkt_destination_file = "{FILEPATH}"
    print(mrkt_destination_file)
    market.to_excel(mrkt_destination_file,
                    index=False,
                    sheet_name="extraction")
    report = run.upload_epi_data(bundle, mrkt_destination_file)
    # Assert nothing in the report is wrong.
    assert (report['request_status'].item() == 'Successful')
Ejemplo n.º 5
0
def upload_dataset(bundle_id, out_dir, status_dir):
    """Upload the prepared inputs for a bundle to the Epi database.

    Builds the expected excel filename under out_dir, uploads it, and
    returns a human-readable message containing the request status.
    """
    # Excel file to upload for this bundle.
    excel_file = "{out_dir}new_inputs_{bundle_id}.xlsx".format(
        out_dir=out_dir, bundle_id=bundle_id)
    print(bundle_id)
    print(excel_file)

    # Push the sheet to the epi database.
    status = upload_epi_data(bundle_id, excel_file)

    print(status)

    result = str(status.loc[0, 'request_status'])
    message = ("Inputs for {bundle_id} has {result} for the upload to the "
               "Database").format(bundle_id=bundle_id, result=result)

    return message
Ejemplo n.º 6
0
def upload(bundle_id, destination_file, error_path):
    """Validate destination_file and upload it for bundle_id.

    Asserts that validation passes and that the upload request is
    reported 'Successful'; returns the upload status frame.
    """
    validation = validate_input_sheet(bundle_id, destination_file, error_path)
    assert validation['status'].item() == 'passed'
    upload_report = upload_epi_data(bundle_id, destination_file)
    assert upload_report['request_status'].item() == 'Successful'
    return upload_report
Ejemplo n.º 7
0
import sys
from elmo import run
import db_queries

# Command-line args: pipeline step, bundle ID, and the directory holding
# the prepared input sheet.
step, bundle, out_dir = sys.argv[1:4]
print step
print bundle
print out_dir

# Upload the step-specific input sheet for this bundle to the epi database.
df = run.upload_epi_data(bundle, '%s/step_%s_input_%s.xlsx'% (out_dir, step, bundle))

Ejemplo n.º 8
0
def hospital_data(df, bundle, cause_name):
    """Delete existing hospital rows and re-upload corrected hospital data.

    Promotes the correction-factor-3 mean/lower/upper columns to the
    canonical names, adds hospital covariates, outliers known locations,
    applies cause-specific tweaks and uploads the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Current epi data for the bundle.
    bundle : int
        Bundle ID being adjusted.
    cause_name : str
        Unused here; kept so the signature matches the other adjustment
        helpers.
    """
    # Delete hospital data and reupload with new specifications:
    # uploading only bundle_id/seq removes those rows from the database.
    hosp_delete = df.loc[(df.cv_hospital == 1), ['bundle_id', 'seq']]
    destination_file = "{FILEPATH}".format(b=bundle, d=date)
    if not hosp_delete.empty:
        print(destination_file)
        hosp_delete.to_excel(destination_file,
                             index=False,
                             sheet_name="extraction")
        report = run.upload_epi_data(bundle, destination_file)
        # Assert nothing in the report is wrong.
        assert (report['request_status'].item() == 'Successful')

    hosp_infile = "{FILEPATH}"
    # BUG FIX: "{b}_v6_{DATE}.xlsx".format(b=bundle) raised KeyError('DATE')
    # because no DATE kwarg was supplied; pass the date the same way the
    # deletion filename above does.
    fname = "{b}_v6_{d}.xlsx".format(b=bundle, d=date)
    hospital = pd.read_excel(os.path.join(hosp_infile, fname),
                             header=0,
                             sheet="extraction")

    # Change to mean_3 (mean_0*correction_factor_3): fill missing *_3
    # values from the *_0 columns, then rename *_3 to the canonical names.
    for var in ['mean', 'lower', 'upper']:
        hospital.loc[hospital['{}_3'.format(var)].isnull(),
                     '{}_3'.format(var)] = hospital['{}_0'.format(
                         var)] * hospital['correction_factor_3']
        hospital.rename(columns={'{}_3'.format(var): var}, inplace=True)
    # Add covariates: flag hospital data and split by age under/over 1.
    hospital['cv_hospital'] = 1
    hospital['cv_hosp_under1'] = 0
    hospital['cv_hosp_over1'] = 0
    hospital.loc[hospital.age_start == 0, 'cv_hosp_under1'] = 1
    hospital.loc[hospital.age_start > 0, 'cv_hosp_over1'] = 1
    # Drop the now-redundant columns for correction factors 0/1/2.
    searchfor = ['_0', '_1', '_2']
    cols = [
        c for c in hospital.columns
        if (searchfor[0] in c) or (searchfor[1] in c) or (searchfor[2] in c)
    ]
    hospital.drop(cols, axis=1, inplace=True)
    # Add note.
    hospital[
        'note_modeler'] = "Hospital data version 6.3, prepped {DATE} by {USERNAME}. Used Mean 3: inpatient and outpatient, after correction for multiple visits, all diagnoses"

    # Outlier locations.
    out_loc_list_1 = ["Roraima", "Turkey", "Meghalaya", "Philippines"]
    hospital.loc[hospital['location_name'].isin(out_loc_list_1),
                 "is_outlier"] = 1

    # Additional outlier locations for these specific bundles.
    if bundle in (602, 604, 620, 624):
        out_loc_list_2 = [
            "Lithuania", "Poland", "Croatia", "Romania", "Slovakia",
            "Czech Republic", "Slovenia"
        ]
        hospital.loc[hospital['location_name'].isin(out_loc_list_2),
                     "is_outlier"] = 1

    hospital = cause_specifics(hospital, bundle)
    hosp_destination_file = "{FILEPATH}"
    print(hosp_destination_file)
    hospital.to_excel(hosp_destination_file,
                      index=False,
                      sheet_name="extraction")
    report = run.upload_epi_data(bundle, hosp_destination_file)
    # Assert nothing in the report is wrong.
    assert (report['request_status'].item() == 'Successful')
    # NOTE(review): orphaned fragment -- the enclosing `def` header is not
    # visible in this chunk; names such as mapping, me, data, delete,
    # version and run must come from that missing scope. Confirm before use.
    #set etiology specific variables, change to integer from float
    bundle = mapping.loc[mapping['me'] == me, 'bundle'].values[0].astype(int)
    acause = mapping.loc[mapping['me'] == me, 'acause'].values[0]

    ##clean data
    # Drop rows with a zero sample size and zero-fill missing means.
    data = data.loc[data.sample_size != 0]
    data.loc[data['mean'].isnull() == True, 'mean'] = 0
    #add outlier where mean is less than 1%
    data.loc[data['mean'] < 0.01, 'is_outlier'] = 1
    #create seq column
    data['seq'] = ""
    #try to replace source type to see if it will upload
    data['source_type'] = "Vital registration - sample"

    if delete == 1:
        ##download epi data to delete database
        print 'deleting {} bundle {} for {} upload'.format(
            acause, bundle, version)
        del_path = "FILEPATH".format(acause, bundle, version)
        epi_data = run.get_epi_data(bundle)
        epi_data = epi_data[['seq']]
        # Uploading a seq-only sheet clears the bundle's existing rows.
        epi_data.to_excel(del_path, index=False, sheet_name='extraction')

        ##upload blank sheet
        upload = run.upload_epi_data(bundle, del_path)

    ##export data
    print 'uploading {} bundle {} {}'.format(acause, bundle, version)
    upload_path = "FILEPATH".format(acause, bundle, me, version)
    data.to_excel(upload_path, index=False, sheet_name='extraction')
    upload = run.upload_epi_data(bundle, upload_path)
Ejemplo n.º 10
0
# In[78]:

## For troubleshooting
# NOTE(review): `collect`, `run` and `write_dir` are defined in earlier
# notebook cells that are not visible here.
collect.to_csv('FILEPATH')


# In[92]:

## Sanity check: number of rows collected.
len(collect)


# In[93]:

## upload data
run.upload_epi_data(3125,write_dir +'ip_upload.xlsx')


# # outlier/ifd specifics

# In[65]:

## Read the outlier sheet from disk.
outlier = pd.read_csv('FILEPATH')


# In[66]:

## Inspect the first rows of the outlier sheet.
outlier.head()


# In[67]:
Ejemplo n.º 11
0
import sys
from elmo import run

# Command-line args: chronic CSMR bundle ID, dismod directory (not used
# below), and the directory holding the prepared epi input sheet.
chronic_csmr_bd, dismod_dir, out_dir = sys.argv[1:4]

# Upload the bundle-specific input sheet to the epi database.
df = run.upload_epi_data(chronic_csmr_bd,
                         '%s/epi_input_%s.xlsx' % (out_dir, chronic_csmr_bd))
Ejemplo n.º 12
0
 def upload(self, modelable_entity_id, fname):
     """Upload the sheet at *fname* for the bundle mapped to this ME and
     return the resulting status frame."""
     return upload_epi_data(me_to_bundle[modelable_entity_id], fname)
Ejemplo n.º 13
0
## Check length one more time to make sure it looks correct
# NOTE(review): `collect`, `run`, `write_dir` and `pd` come from earlier
# notebook cells not visible here.
len(collect)


# In[ ]:

## Check age trend too
# Mean of the `mean` column by age group and sex; as_index=False keeps
# the group keys as ordinary columns.
collect.groupby(['age_group_id', 'sex_id'], as_index = False)['mean'].mean()


# In[ ]:

## Upload data
## Should be ready!

run.upload_epi_data(3125,write_dir +'aut.xlsx')


# # Bring in China Hospital data and process

# In[ ]:

# The HDF file is read via its 'df' key.
chn = pd.read_hdf('FILEPATH', key = 'df')
## already in age_groups


# In[ ]:

## Keep only the columns needed downstream.
chn = chn[['location_id', 'year_start', 'year_end']]