import time

import numpy as np
import pandas as pd
import xarray as xr

import xr_conversion  # helper module providing df_to_xr (exact import path assumed)


def test(region_list, compare_file, compare_num):
    """Aggregate location files using a DataArray rather than a Dataset.

    This version also reads files into xarray through xr_conversion.df_to_xr,
    which minimizes the overhead of the to_xarray method provided by pandas.
    """
    new = True
    for file in region_list:
        print(file)
        temp = pd.read_hdf(file)
        temp = xr_conversion.df_to_xr(
            temp,
            dims=['age_group_id', 'sex_id', 'year_id', 'cause_id', 'location_id'],
            wide_dim_name='draw')
        # Drop the length-1 location dimension so the per-location arrays can be summed directly.
        temp = temp.squeeze('location_id')
        if new:
            da = temp
            new = False
        else:
            da = da + temp
    # Relabel the aggregate with the super-region location id and restore the dimension.
    da.coords['location_id'] = compare_num
    da = da.expand_dims('location_id')
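# A minimal, self-contained sketch of the squeeze-and-add pattern used above, with small
# synthetic DataArrays standing in for the per-location files (the location ids, sizes,
# and the aggregate id 999 are hypothetical, chosen only for illustration).
def _sketch_squeeze_and_add():
    per_location = [
        xr.DataArray(
            np.random.rand(2, 3, 1),
            dims=['age_group_id', 'draw', 'location_id'],
            coords={'age_group_id': [2, 3], 'draw': [0, 1, 2], 'location_id': [loc]})
        for loc in (101, 102, 103)]
    total = None
    for da in per_location:
        da = da.squeeze('location_id')        # drop the length-1 location dimension
        total = da if total is None else total + da
    total.coords['location_id'] = 999         # label the aggregate location
    return total.expand_dims('location_id')   # restore location as a dimension of size 1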
def test(region_list, compare_file, compare_num):
    """Aggregate location files by concatenating them along the location dimension and
    summing their values along that dimension, so that the result represents the super
    region.  Each time a file is concatenated and summed, the aggregate is relabelled
    with the location dimension so that further location files (such as sub-Saharan
    Africa) can be concatenated and summed in.

    To time the aggregation on its own, comment out the call to compare_temp.
    """
    new = True
    for file in region_list:
        print(file)
        temp = pd.read_hdf(file)
        temp = xr_conversion.df_to_xr(
            temp,
            dims=['location_id', 'age_group_id', 'sex_id', 'year_id', 'cause_id'],
            wide_dim_name='draw')
        if new:
            da = temp
            new = False
        else:
            da = xr.concat([da, temp], dim='location_id')
            da = da.sum(dim='location_id')
            # Summing removes the location dimension; relabel the aggregate with the
            # super-region location id so the next file can be concatenated against it.
            da.coords['location_id'] = compare_num
    compare_temp(da, compare_file)
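# A minimal sketch of the concat-and-sum pattern above, using synthetic DataArrays in
# place of the HDF files (the location ids and the aggregate id 999 are hypothetical).
def _sketch_concat_and_sum():
    per_location = [
        xr.DataArray(
            np.random.rand(1, 4),
            dims=['location_id', 'draw'],
            coords={'location_id': [loc], 'draw': list(range(4))})
        for loc in (101, 102, 103)]
    agg = per_location[0]
    for da in per_location[1:]:
        agg = xr.concat([agg, da], dim='location_id').sum(dim='location_id')
        agg.coords['location_id'] = 999   # relabel so the next concat has a location to align on
    return agg.expand_dims('location_id')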
def test(region_list, compare_file, compare_num):
    """Aggregate location files using a DataArray rather than a Dataset, timing each step.

    This version also reads files into xarray through xr_conversion.df_to_xr,
    which minimizes the overhead of the to_xarray method provided by pandas.
    """
    new = True
    for file in region_list:
        t2 = time.time()
        temp = pd.read_hdf(file)
        t3 = time.time()
        print("Time to read in an hdf5 file to pandas {time}".format(time=t3 - t2))
        t4 = time.time()
        temp = xr_conversion.df_to_xr(
            temp,
            dims=['age_group_id', 'sex_id', 'year_id', 'cause_id', 'location_id'],
            wide_dim_name='draw')
        t5 = time.time()
        print("Time to convert from pandas to an xarray DataArray {time}".format(time=t5 - t4))
        print("Total upload time is {time}".format(time=t5 - t2))
        temp = temp.squeeze('location_id')
        print(file)
        if new:
            da = temp
            new = False
        else:
            da = da + temp
    # Relabel the aggregate with the super-region location id and restore the dimension.
    da.coords['location_id'] = compare_num
    da = da.expand_dims('location_id')
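# The repeated time.time() bookkeeping above could be wrapped in a small context manager;
# this helper is a sketch (not part of the original code) of one way to do that.
from contextlib import contextmanager

@contextmanager
def _timed(label):
    start = time.time()
    yield
    print("{label}: {secs:.3f}s".format(label=label, secs=time.time() - start))

# Example usage:
#     with _timed("read hdf5 into pandas"):
#         temp = pd.read_hdf(file)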
def read_hdf_to_netcdf(hdf_list):
    """Convert each HDF file in hdf_list to a netCDF file with the same base name."""
    for file in hdf_list:
        print(file)
        df = pd.read_hdf(file)
        da = xr_conversion.df_to_xr(
            df,
            dims=['location_id', 'age_group_id', 'cause_id', 'sex_id', 'year_id'],
            wide_dim_name='draw')
        split = file.split(".")
        netcdf_name = split[0] + ".nc"
        da.to_netcdf(netcdf_name)
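# A quick round-trip check (a sketch with a hypothetical file name): the .nc files
# written above can be read back directly with xarray, bypassing pandas entirely.
def _sketch_read_back(netcdf_name="example_region.nc"):
    da = xr.open_dataarray(netcdf_name)
    print(da.dims, da.sizes)
    return da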
def read_to_data_array(filelist):
    """Read each HDF file, convert it to a DataArray, save it as netCDF, and return
    the list of netCDF file names that were written."""
    netcdf_list = []
    for file in filelist:
        t2 = time.time()
        df = pd.read_hdf(file)
        da = xr_conversion.df_to_xr(
            df,
            dims=['age_group_id', 'sex_id', 'year_id', 'cause_id', 'location_id'],
            wide_dim_name='draws')
        print(file)
        split = file.split(".")
        netcdf_name = split[0] + ".nc"
        da.to_netcdf(netcdf_name)
        t3 = time.time()
        netcdf_list.append(netcdf_name)
        print("Time to read and save to netcdf: {time}".format(time=t3 - t2))
    return netcdf_list
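# Once the inputs are cached as netCDF by read_to_data_array, they can be reloaded and
# aggregated without going back through pandas; a sketch (not part of the original code):
def _sketch_aggregate_from_netcdf(netcdf_list, compare_num):
    arrays = [xr.open_dataarray(name) for name in netcdf_list]
    agg = xr.concat(arrays, dim='location_id').sum(dim='location_id')
    agg.coords['location_id'] = compare_num
    return agg.expand_dims('location_id')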
def compare_temp(aggregated_da, comp_file):
    """Compare the aggregated values against the super-region file (e.g. the sub-Saharan
    Africa file) to make sure the locations were aggregated correctly and the values match.
    """
    print("In compare_temp function")
    # Read the comparison file into an xarray DataArray.
    comp_df = pd.read_hdf(comp_file)
    comp_da = xr_conversion.df_to_xr(
        comp_df,
        dims=['location_id', 'age_group_id', 'sex_id', 'year_id', 'cause_id'],
        wide_dim_name='draw')

    # First test: exact equality (same dims, coords, and values).
    if comp_da.equals(aggregated_da):
        print("EQUAL")
    else:
        print("NOT EQUAL")

    # Second test: restrict the comparison file to the location ids present in the aggregate.
    print("Next equality test:")
    good_coords = [val for val in aggregated_da.coords['location_id'].values
                   if val in comp_da.coords['location_id'].values]
    matched = comp_da.loc[{'location_id': good_coords}]
    print("GOOD COORD VALS: {good}".format(good=good_coords))
    print("MATCHED VALS: {matched}".format(matched=matched))

    # Final test: compare draw by draw within a tolerance, treating NaNs as equal.
    print("Final test")
    check = True
    draw_vals = comp_da.coords['draw'].values
    for el in draw_vals:
        comp_val = comp_da.sel(draw=el)
        agg_val = aggregated_da.sel(draw=el)
        compare = np.isclose(comp_val, agg_val, rtol=1e-05, atol=1e-08, equal_nan=True)
        if not compare.all():
            print("not the same")
            check = False
            print("COMP {comp}".format(comp=comp_val))
            print("AGG {agg}".format(agg=agg_val))
    if check:
        print("ALL EQUAL!!!")
    else:
        print("NOT all equal")
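# The draw-by-draw loop above can often be replaced by one whole-array comparison; this
# is a sketch of that idea (not part of the original code).
def _sketch_compare_whole(aggregated_da, comp_da):
    # Keep only the coordinates the two arrays share and match dimension order.
    a, b = xr.align(aggregated_da, comp_da, join='inner')
    b = b.transpose(*a.dims)
    # Tolerance-based comparison over every value at once, with NaNs treated as equal.
    return bool(np.isclose(a.values, b.values, rtol=1e-05, atol=1e-08, equal_nan=True).all())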