def extract_entities_groups(regs, gps):
    """Extract the entities of every country grouping.

    regs: regions.json content; iterated as single-key mappings of
        region name -> gwid.
    gps: area_categorizarion.json content as a DataFrame with an ``n``
        column (group names) and a ``groupings`` column (mappings keyed
        by gwid).

    Returns a dictionary whose keys are the group concept ids and whose
    values are DataFrames with the columns
    ``[<group>, 'name', 'gwid', 'is--<group>']``.
    """
    # gwid -> region name; only the first key of each record is used.
    regd = {}
    for rec in regs:
        first_key = list(rec.keys())[0]
        regd[rec.get(first_key)] = first_key

    res = {}
    # Series.items() replaces iteritems(), which newer pandas no longer has.
    # NOTE(review): `i` is an index label used positionally with iloc below,
    # which presumably relies on gps having a default RangeIndex - confirm.
    for i, n in gps.n.apply(to_concept_id).items():
        df = pd.DataFrame([], columns=[n, 'name', 'gwid', 'is--' + n])
        df['gwid'] = list(gps.iloc[i]['groupings'].keys())
        if i == 4:
            # NOTE(review): the grouping at position 4 is converted with an
            # empty separator; the reason is not visible here.
            df[n] = df['gwid'].apply(lambda x: to_concept_id(regd[x], sep=''))
        else:
            df[n] = df['gwid'].apply(lambda x: to_concept_id(regd[x]))
        df['name'] = df['gwid'].apply(lambda x: regd[x])
        df['is--' + n] = 'TRUE'
        res[n] = df
    return res
def map_to_id(x):
    """Return the concept id for a region label.

    "Stockholms län" is prefixed with "01 " and "Riket" is replaced by
    "swe" before conversion; every other label is converted as is.
    """
    if x == "Stockholms län":
        label = "01 " + x
    elif x == "Riket":
        label = "swe"
    else:
        label = x
    return to_concept_id(label)
def extract_datapoints(data):
    """Return ``(concept_id, datapoints)`` for the hours-worked indicator.

    The datapoints frame has the columns country, year and the indicator's
    concept id; rows containing missing values are dropped.
    """
    indicator = 'Annual number of hours actually worked per person'
    frame = data[['Country (code)', 'Year', indicator]].copy()
    frame.columns = ['country', 'year', to_concept_id(indicator)]
    frame['country'] = frame['country'].map(to_concept_id)
    return to_concept_id(indicator), frame.dropna()
def _read_data(data_file):
    """Read an XML data file and return ``{indicator_id: DataFrame}``.

    Every ``Series`` element holds one time series for one indicator and
    one location. The series are grouped per indicator and concatenated
    into a frame with the columns ``time``, ``<indicator_id>``, ``location``.
    """
    with open(data_file) as fh:
        tree = etree.parse(fh)
    root = tree.getroot()

    # The default namespace is registered under None; give it a usable prefix.
    namespaces = root.nsmap.copy()
    namespaces['xmlns'] = namespaces.pop(None)

    all_data = {}
    for node in root.xpath('.//xmlns:Series', namespaces=namespaces):
        series = xmltodict.parse(etree.tostring(node))['Series']

        # Series attributes: indicator id and location.
        attrs = {}
        for value in series['SeriesKey']['Value']:
            if value['@concept'] == 'EDULIT_IND':
                attrs['key'] = to_concept_id(value['@value'])
            if value['@concept'] == 'LOCATION':
                attrs['location'] = to_concept_id(value['@value'])

        # A single observation is parsed as a mapping, several as a list.
        obs = series['Obs']
        observations = obs if isinstance(obs, list) else [obs]
        rows = [[o['Time'], o['ObsValue']['@value']] for o in observations]

        key = attrs['key']
        location = attrs['location']
        if key in all_data:
            # there should be no duplicates in locations
            assert location not in all_data[key]
            all_data[key][location] = rows
        else:
            all_data[key] = {location: rows}

    # Concatenate the per-location series of each indicator.
    for key, by_location in all_data.items():
        frames = []
        for location, rows in by_location.items():
            frame = pd.DataFrame(rows, columns=['time', key])
            frame['location'] = location
            frames.append(frame)
        all_data[key] = pd.concat(frames, ignore_index=True)
    return all_data
def extract_datapoints(data):
    """Yield ``(concept_id, datapoints)`` for every indicator in ``data``.

    ``data`` maps indicator names to frames. Each frame gets its columns
    and ``country`` values converted to concept ids (in place on the input
    frame), then the columns other than country/year are stacked into a
    ``sex`` dimension.
    """
    for indicator, frame in data.items():
        frame.columns = [to_concept_id(c) for c in frame.columns]
        frame['country'] = frame['country'].map(to_concept_id)
        stacked = frame.set_index(['country', 'year']).stack().reset_index()
        stacked['year'] = stacked['year'].map(int)
        stacked.columns = ['country', 'year', 'sex', to_concept_id(indicator)]
        yield to_concept_id(indicator), stacked
def serve_datapoints_return_measures(data_full: pd.DataFrame, measure: dict, metric: dict):
    """Write one datapoints CSV per (measure, metric, cause) and list the measures.

    data_full: frame with at least the columns measure, metric, location,
        sex, age, cause, year and val.
    measure / metric: mappings from the ids found in ``data_full`` to names.

    Returns a list of ``(concept_id, name)`` tuples, one per
    measure/metric combination present in the data.
    """
    sort_keys = ['location', 'sex', 'age', 'year']
    measures = []
    by_measure_metric = data_full.groupby(by=['measure', 'metric'])
    for group_key in by_measure_metric.groups:
        name = measure[group_key[0]] + ' ' + metric[group_key[1]]
        concept = to_concept_id(name)
        measures.append((concept, name))

        frame = by_measure_metric.get_group(group_key).rename(columns={'val': concept})
        # The per-cause grouping is created before the values are formatted,
        # in the same order as the original code.
        by_cause = frame.groupby(by='cause')
        out_cols = ['location', 'sex', 'age', 'cause', 'year', concept]
        frame[concept] = frame[concept].map(formatter)

        for cause_key in by_cause.groups:
            part = by_cause.get_group(cause_key)
            dims = ['location', 'sex', 'age', 'cause-{}'.format(cause_key), 'year']
            file_name = 'ddf--datapoints--' + concept + '--by--' + '--'.join(dims) + '.csv'
            target = osp.join(output_dir, file_name)
            part[out_cols].sort_values(by=sort_keys).to_csv(target, index=False)
    return measures
def extract_datapoints(data):
    """Yield ``(concept_id, datapoints)`` for the labour-force indicators.

    Each yielded frame has the columns country, sex, age_group, year and
    the indicator's concept id, with rows containing missing values dropped.
    """
    conc = [
        "Labour force ('000)",
        "Population ('000)",
        "Labour force participation rate (%)",
    ]
    dps = data[[
        'Country (code)', 'Sex (code)', 'Age group (code)', 'Year', *conc
    ]].copy()
    dps.columns = [
        'country', 'sex', 'age_group', 'year',
        *[to_concept_id(_rename_concept(x)) for x in conc]
    ]
    dps['country'] = dps['country'].map(to_concept_id)
    dps['sex'] = dps['sex'].map(to_concept_id)
    # '+' is a regex quantifier; regex=False makes the replacement literal
    # regardless of the pandas version's default for `regex`.
    dps['age_group'] = (dps['age_group']
                        .str.replace('+', '_plus', regex=False)
                        .map(to_concept_id))
    dps = dps.set_index(['country', 'sex', 'age_group', 'year'])
    for k, df in dps.items():
        yield k, df.reset_index().dropna()
def main():
    """Convert the injury mortality workbook into DDF csv files.

    Reads the "Rates" sheet and writes datapoints (one file per value
    column), the country/sex/cause entities and the concepts file.
    """
    data = pd.read_excel('../source/injury_mortality_trend_tables.xls',
                         sheet_name="Rates", skiprows=4)

    # manually set the cause entity domain name->concept mapping
    cause_ids = {
        'Road traffic accidents': 'traffic',
        'Homicide': 'homicide',
        'Self-inflicted injuries': 'suicide',
    }
    data = data.rename(columns={'cause of death': 'cause'})
    data['cause'] = data['cause'].map(cause_ids)

    # datapoints
    indexed = (data.copy()
               .drop(['ICD', 'name'], axis=1)
               .set_index(['country', 'year', 'cause', 'sex']))
    measure_names = indexed.columns
    for column in indexed:
        measure_id = to_concept_id(column)
        series = indexed[column].copy()
        series.name = measure_id
        series.reset_index().dropna().to_csv(
            f"../../ddf--datapoints--{measure_id}--by--country--year--cause--sex.csv",
            index=False)

    # country
    country = data[['country', 'name']].drop_duplicates(subset=['country'], keep='first')
    country.to_csv('../../ddf--entities--country.csv', index=False)

    # sex
    sex = pd.DataFrame([[0, 'Both sexes'], [1, 'male'], [2, 'female']],
                       columns=['sex', 'name'])
    sex.to_csv('../../ddf--entities--sex.csv', index=False)

    # cause
    cause = pd.DataFrame.from_dict(cause_ids, orient='index').reset_index()
    cause.columns = ['name', 'cause']
    cause.to_csv('../../ddf--entities--cause.csv', index=False)

    # concepts
    measures = pd.DataFrame.from_dict({
        'concept': measure_names.map(to_concept_id),
        'name': measure_names,
    })
    measures['concept_type'] = 'measure'

    domains = pd.DataFrame(
        [['country', 'Country'], ['sex', 'Sex'], ['cause', 'Cause']],
        columns=['concept', 'name'])
    domains['concept_type'] = 'entity_domain'

    others = pd.DataFrame(
        [['year', 'Year', 'time'], ['name', 'Name', 'string']],
        columns=['concept', 'name', 'concept_type'])

    concepts = pd.concat([measures, domains, others], sort=False)
    concepts.to_csv('../../ddf--concepts.csv', index=False)
def extract_entities_country(regs, geo, gps, geo_sg, geo_map=False):
    """Build the country entities, or a mapping from old to new country ids.

    regs: regions.json, contains country/region name and gwid.
    geo: country_synonyms.xlsx, contains all country info.
    gps: area_categorizarion.json, contains groups name and group levels.
    geo_sg: country entities from systema_globalis.
    geo_map: if True, return a Series indexed by the old country id
        (``country_2``, taken from ``ISO3dig_ext``) whose values are the
        new ids; otherwise return the country entities frame with the
        columns country, gwid, name first.
    """
    # gwid -> region name; only the first key of each record is used.
    regd = {}
    for rec in regs:
        first_key = list(rec.keys())[0]
        regd[rec.get(first_key)] = first_key

    geo_ = geo[['ISO3dig_ext', 'Gwid']].set_index('Gwid')
    geo_2 = geo.set_index('Gwid').drop('ISO3dig_ext', axis=1)
    country = geo_.copy()

    # loop through all groupings, build a dataframe whose index is gwid and
    # whose columns are the group names.
    # Series.items() replaces iteritems(), which newer pandas no longer has.
    for i, n in gps.n.apply(to_concept_id).items():
        res = {}
        for k, v in gps.iloc[i]['groupings'].items():
            for gwid in v:
                if gwid:
                    res[gwid] = to_concept_id(regd[k])
        country[n] = pd.Series(res)

    # combine the groupings info and other info, and do some cleanups.
    country2 = pd.concat([country, geo_2], axis=1).reset_index()
    country2 = country2.rename(columns={'NAME': 'Upper Case Name',
                                        'Use Name': 'Name',
                                        'ISO3dig_ext': 'country_2'})
    country2.columns = list(map(to_concept_id, country2.columns))
    country2['is--country'] = 'TRUE'

    # adding world_4region data
    country3 = geo_sg[['geo', 'world_4region', 'latitude', 'longitude', 'name']]
    country3 = country3.rename(columns={'geo': 'country'}).set_index('name')

    # the final dataframe, joined on the country name
    country4 = pd.concat([country2.set_index('name'), country3], axis=1)
    country4 = country4.reset_index().rename(columns={'index': 'name'})

    if geo_map:
        return country4.set_index('country_2')['country']

    country4 = country4.drop('country_2', axis=1)
    cols = country4.columns.drop(['country', 'gwid', 'name'])
    ex_col = np.r_[['country', 'gwid', 'name'], cols]
    return country4.loc[:, ex_col]
def concept_id(obj, renames=None, dict_value=True, dict_key=True):
    """Convert ``obj`` (or the items it contains) to concept ids.

    obj: a single value, a list, or a dict.
    renames: mapping handed to ``rename`` before conversion; defaults to
        an empty mapping.
    dict_value / dict_key: for dict input, whether values / keys are
        converted or kept as they are.

    Lists produce a list and dicts produce a dict; any other value is
    passed through ``rename``, ``underscore`` and ``to_concept_id``.
    """
    # Use a fresh dict per call instead of one shared mutable default.
    if renames is None:
        renames = {}
    # NOTE(review): the nested calls below forward only `renames`, so
    # dict_key/dict_value apply to the outermost dict only - confirm intended.
    if isinstance(obj, list):
        return [concept_id(x, renames) for x in obj]
    if isinstance(obj, dict):
        return {
            (concept_id(key, renames) if dict_key else key):
                (concept_id(value, renames) if dict_value else value)
            for key, value in obj.items()
        }
    return to_concept_id(underscore(rename(obj, renames)))
def extract_datapoints(all_data):
    """Yield ``(concept_id, datapoints)`` for every variable in every frame.

    all_data: iterable of frames with the columns 'Variable Name', 'Area',
        'Year' and 'Value'.

    Each yielded frame has the columns area, year and the variable's
    concept id, with duplicate rows removed.
    """
    for df in all_data:
        for g, ids in df.groupby('Variable Name').groups.items():
            # `groups` holds index labels, so select with .loc
            # (.ix is no longer available in pandas).
            df_concept = df.loc[ids].copy()
            concept = to_concept_id(g)
            # Map only this group's rows rather than the whole frame.
            df_concept['area'] = df_concept['Area'].map(to_concept_id)
            df_concept = df_concept.rename(columns={'Value': concept,
                                                    'Year': 'year'})
            df_yield = df_concept[['area', 'year', concept]].copy()
            yield concept, df_yield.drop_duplicates()
def main():
    """Create datapoints, entities and concept files from the source zips.

    One datapoints set is served for every MEASURES x METRICS combination;
    the continuous and discrete concept files are written afterwards.
    """
    md = ihme.load_metadata()
    metric_df = md['metric'].copy()
    measure_df = md['measure'].copy()

    # datapoints
    datapoint_output_dir = osp.join(output_dir, 'deaths')
    os.makedirs(datapoint_output_dir, exist_ok=True)

    zip_names = [f for f in os.listdir(source_dir) if f.endswith('.zip')]
    data_full = dd.from_delayed(
        [dask.delayed(load_data)(f) for f in zip_names], meta=DTYPES)

    metric = metric_df.set_index('id')['name'].to_dict()
    measure = measure_df.set_index('id')['short_name'].to_dict()

    cols = ['location', 'sex', 'age', 'cause', 'year', 'val']
    all_measures = []
    for measure_id, metric_id in product(MEASURES, METRICS):
        name = measure[measure_id] + ' ' + metric[metric_id]
        print(f'creating dattpoints for {name}')
        concept = to_concept_id(name)
        all_measures.append((concept, name))
        selected = (data_full.measure == measure_id) & (data_full.metric == metric_id)
        df = data_full.loc[selected, cols].compute()
        serve_datapoint(df, concept)

    # entities
    serve_entities(md)

    # concepts
    cont_cdf = pd.DataFrame(all_measures, columns=['concept', 'name'])
    cont_cdf['concept_type'] = 'measure'
    cont_cdf.to_csv('../../ddf--concepts--continuous.csv', index=False)

    discrete_rows = [
        ['name', 'Name', 'string'],
        ['short_name', 'Short Name', 'string'],
        ['medium_name', 'Medium Name', 'string'],
        ['long_name', 'Long Name', 'string'],
        ['location', 'Location', 'entity_domain'],
        ['sex', 'Sex', 'entity_domain'],
        ['age', 'Age', 'entity_domain'],
        ['cause', 'Cause', 'entity_domain'],
        ['rei', 'Risk/Etiology/Impairment', 'entity_domain'],
        ['label', 'Label', 'string'],
        ['year', 'Year', 'time'],
        ['type', 'Type', 'string'],
    ]
    dis_cdf = pd.DataFrame(discrete_rows, columns=['concept', 'name', 'concept_type'])
    dis_cdf.sort_values(by='concept').to_csv('../../ddf--concepts--discrete.csv', index=False)

    print("Done.")
def extract_datapoints(all_data):
    """Yield ``(concept_id, datapoints)`` for every variable in every frame.

    all_data: iterable of frames with the columns 'Variable Name', 'Area',
        'Year' and 'Value'.

    Each yielded frame has the columns area, year and the variable's
    concept id, with duplicate rows removed.
    """
    for df in all_data:
        for g, ids in df.groupby('Variable Name').groups.items():
            # `groups` holds index labels, so select with .loc
            # (.ix is no longer available in pandas).
            df_concept = df.loc[ids].copy()
            concept = to_concept_id(g)
            # Map only this group's rows rather than the whole frame.
            df_concept['area'] = df_concept['Area'].map(to_concept_id)
            df_concept = df_concept.rename(columns={
                'Value': concept,
                'Year': 'year',
            })
            df_yield = df_concept[['area', 'year', concept]].copy()
            yield concept, df_yield.drop_duplicates()
def extract_concepts(data):
    """Build the concepts table for the indicators in ``data``.

    data: mapping whose keys are the indicator names.

    Returns a frame with the columns concept, name and concept_type. The
    first four rows are the discrete concepts name, year, sex and country;
    the remaining rows are one measure per indicator.
    """
    all_concepts = list(data.keys())
    all_concept_ids = [to_concept_id(x) for x in all_concepts]

    concepts = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
    concepts['concept'] = ['name', 'year', 'sex', 'country', *all_concept_ids]
    concepts['name'] = ['Name', 'Year', 'Sex', 'Country', *all_concepts]
    concepts['concept_type'] = 'measure'

    # Assign through .loc[row, column]: `concepts.iloc[k][col] = ...` is
    # chained assignment and may write to a temporary copy.
    concepts.loc[0, 'concept_type'] = 'string'
    concepts.loc[1, 'concept_type'] = 'time'
    concepts.loc[2:3, 'concept_type'] = 'entity_domain'  # label slice, inclusive
    return concepts
def extract_datapoints(data):
    """Yield ``(concept_id, datapoints)`` for every concept name in ``data``.

    data: frame with the columns 'cname', 'Time Period', 'Area ID' and
        'Data Value'.

    Each yielded frame has the columns year, area and the concept id,
    sorted by area and year.
    """
    dps = data[['cname', 'Time Period', 'Area ID', 'Data Value']].copy()
    dps.columns = ['concept', 'year', 'area', 'data']
    dps['area'] = dps['area'].map(to_concept_id)
    dps['concept'] = dps['concept'].map(
        lambda x: to_concept_id(x, '[/ -\\.\\*";:]+'))

    for k, idx in dps.groupby(by='concept').groups.items():
        # `groups` holds index labels, so select with .loc
        # (.ix is no longer available in pandas).
        df = dps.loc[idx, ['year', 'area', 'data']].copy()
        df.columns = ['year', 'area', k]
        yield k, df.sort_values(by=['area', 'year'])
def extract_concepts(data):
    """Build the concepts table: four discrete concepts, then the measures.

    The measure names are taken from columns 12 to 27 of ``data``.
    Returns a frame with the columns concept, name and concept_type.
    """
    discrete = ['Name', 'Year', 'Country', 'Sex']
    measures = data.columns[12:28]

    table = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
    table['name'] = [*discrete, *measures]
    table['concept'] = table['name'].map(
        lambda label: to_concept_id(_rename_concept(label)))

    # Row labels follow the order of `discrete`; label slices are inclusive.
    table.loc[0, 'concept_type'] = 'string'
    table.loc[1, 'concept_type'] = 'time'
    table.loc[2:3, 'concept_type'] = 'entity_domain'
    table.loc[4:, 'concept_type'] = 'measure'
    return table
def extract_concepts(data):
    """Build the concepts table: three discrete concepts, then the measures.

    The measure names are taken from columns 8 to 11 of ``data``.
    Returns a frame with the columns concept, name and concept_type.
    """
    discs = ['Name', 'Year', 'Country']
    conc = data.columns[8:12]

    cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
    cdf['name'] = [*discs, *conc]
    cdf['concept'] = cdf['name'].map(
        lambda x: to_concept_id(_rename_concept(x)))

    # Only three discrete concepts precede the measures here, so the
    # measures start at row 3 and Country (row 2) is the only entity domain.
    # The earlier 2:3 / 4: boundaries typed the first measure as an
    # entity domain.
    n_discs = len(discs)
    cdf.loc[n_discs:, 'concept_type'] = 'measure'
    cdf.loc[0, 'concept_type'] = 'string'
    cdf.loc[1, 'concept_type'] = 'time'
    cdf.loc[2, 'concept_type'] = 'entity_domain'
    return cdf
def extract_datapoints(data):
    """Yield ``(concept_id, datapoints)`` for the measures in columns 8-15.

    Each yielded frame has the columns country, year and the concept id,
    with rows containing missing values dropped.
    """
    measures = data.columns[8:16]
    frame = data[['Country (code)', 'Year', *measures]].copy()
    measure_ids = [to_concept_id(_rename_concept(m)) for m in measures]
    frame.columns = ['country', 'year', *measure_ids]
    frame['country'] = frame['country'].map(to_concept_id)

    indexed = frame.set_index(['country', 'year'])
    for concept, series in indexed.items():
        yield concept, series.reset_index().dropna()
def extract_datapoints(data):
    """Yield ``(concept_id, datapoints)`` for the measures in columns 8-11.

    Each yielded frame has the columns country, year and the concept id,
    with rows containing missing values dropped.
    """
    measures = data.columns[8:12]
    frame = data[['Country (code)', 'Year', *measures]].copy()
    measure_ids = [to_concept_id(_rename_concept(m)) for m in measures]
    frame.columns = ['country', 'year', *measure_ids]
    frame['country'] = frame['country'].map(to_concept_id)

    indexed = frame.set_index(['country', 'year'])
    for concept, series in indexed.items():
        yield concept, series.reset_index().dropna()
def extract_concepts(data):
    """Build the concepts table from the distinct (cname, Unit) pairs.

    Returns a frame with the columns concept, name, concept_type and unit:
    four hand-written discrete concepts followed by one measure per
    distinct pair.
    """
    measures = data[['cname', 'Unit']].copy().drop_duplicates()
    measures.columns = ['name', 'unit']
    measures['concept_type'] = 'measure'
    measures['concept'] = measures['name'].map(
        lambda label: to_concept_id(label, '[/ -\\.\\*";:]+'))

    # manually create discrete concepts, in the column order of `measures`
    discrete_rows = [
        ['Name', np.nan, 'string', 'name'],
        ['Year', 'year', 'time', 'year'],
        ['Area', np.nan, 'entity_domain', 'area'],
        ['Unit', np.nan, 'string', 'unit'],
    ]
    discrete = pd.DataFrame(discrete_rows, columns=measures.columns)

    combined = pd.concat([discrete, measures])
    return combined[['concept', 'name', 'concept_type', 'unit']]
def extract_datapoints_country_year(data):
    """Extract datapoints for each concept by country and year.

    data: wide frame whose first three columns are identifiers (including
        'ISO Code' and 'Uncertainty bounds*') and whose remaining columns
        are named ``<metric><sep><year>``, the year being the last four
        characters.

    Returns a dict mapping the concept id of ``<metric>.<bound>`` (bound
    in Lower/Median/Upper) to a frame with the columns country, year and
    that concept id.
    """
    id_cols = data.columns[:3]

    # Distinct metric names, in order of first appearance: the column name
    # minus its trailing 5 characters (separator + 4-digit year).
    metrics = list(dict.fromkeys(c[:-5] for c in data.columns[3:]))
    # metric -> all columns belonging to it
    col = {m: [c for c in data.columns if c.startswith(m)] for m in metrics}

    res = {}
    for m in metrics:
        col_metric = np.r_[id_cols, col[m]]
        # change the column from metric.year to year
        col_metric_new = np.r_[id_cols, [c[-4:] for c in col[m]]]
        data_metric = data[col_metric].copy()
        data_metric.columns = col_metric_new

        gs = data_metric.groupby(by='Uncertainty bounds*').groups
        for p in ['Lower', 'Median', 'Upper']:
            name = to_concept_id(m + '.' + p)
            # `groups` holds index labels, so select with .loc
            # (.ix is no longer available in pandas).
            data_bound = data_metric.loc[gs[p]].set_index('ISO Code')
            data_bound = data_bound.T['1950':]  # the data from source start from 1950
            data_bound = data_bound.unstack().reset_index().dropna()
            data_bound.columns = ['country', 'year', name]
            data_bound['country'] = data_bound['country'].map(to_concept_id)
            res[name] = data_bound
    return res
def extract_datapoints(data):
    """Yield ``(concept_id, datapoints)`` for the measures in columns 14-18.

    Each yielded frame has the columns country, sex, age_group, year and
    the concept id, with rows containing missing values dropped.
    """
    conc = data.columns[14:19]
    dps = data[['Country (code)', 'Sex (code)', 'Age group (code)', 'Year',
                *conc]].copy()
    dps.columns = ['country', 'sex', 'age_group', 'year',
                   *[to_concept_id(_rename_concept(x)) for x in conc]]
    dps['country'] = dps['country'].map(to_concept_id)
    dps['sex'] = dps['sex'].map(to_concept_id)
    # '+' is a regex quantifier; regex=False makes the replacement literal
    # regardless of the pandas version's default for `regex`.
    dps['age_group'] = (dps['age_group']
                        .str.replace('+', '_plus', regex=False)
                        .map(to_concept_id))
    dps = dps.set_index(['country', 'sex', 'age_group', 'year'])
    for k, df in dps.items():
        yield k, df.reset_index().dropna()
def process_source_files():
    """Create datapoint files from the source files and write the concepts file.

    For every indicator in the indicator list, the matching csv in
    ``source_dir`` is loaded and turned into datapoints unless it is
    missing, empty or rejected by ``can_proceed``. The indicators that were
    processed are written to ``ddf--concepts.csv`` together with the
    name/year/country concepts. Nothing is returned.
    """
    indi_list = []
    indi_desc_list = []
    indi = load_indicator_list()

    print('creating datapoint files...')
    for i in indi['GHO']['Metadata']['Dimension']['Code']:
        path = os.path.join(source_dir, i['@Label'] + '.csv')
        concept = to_concept_id(i['@Label'])
        # Skip indicators without a source file so that no frame from a
        # previous iteration can be reused by accident.
        if not os.path.exists(path):
            continue
        try:
            df = pd.read_csv(path)
        except FileNotFoundError:
            print(f'{path} not found')
            continue
        except pd.errors.EmptyDataError:
            print(f'{path} has no data')
            continue

        result, reason = can_proceed(df)
        if result is False:
            print(f'{concept} skipped: {reason}')
            continue
        create_datapoint(df, concept)
        indi_list.append(concept)
        indi_desc_list.append(i['Display'])

    print('creating concept file...')
    conc = pd.DataFrame([], columns=['concept', 'concept_type', 'name'])
    conc['concept'] = indi_list
    conc['name'] = indi_desc_list
    conc['concept_type'] = 'measure'
    discrete = pd.DataFrame([['name', 'string', 'Name'],
                             ['year', 'time', 'Year'],
                             ['country', 'entity_domain', 'Country']],
                            columns=conc.columns)
    # pd.concat replaces DataFrame.append, which newer pandas no longer has.
    conc = pd.concat([conc, discrete])
    conc_path = os.path.join(out_dir, 'ddf--concepts.csv')
    conc.sort_values(by='concept').to_csv(conc_path, index=False)
def extract_concepts(data):
    """Build the concepts table: five discrete concepts, then three measures.

    ``data`` is not read. Returns a frame with the columns concept, name
    and concept_type.
    """
    discrete = ['Name', 'Year', 'Country', 'Sex', 'Age Group']
    measures = [
        "Labour force ('000)",
        "Population ('000)",
        "Labour force participation rate (%)",
    ]

    table = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
    table['name'] = [*discrete, *measures]
    table['concept'] = table['name'].map(
        lambda label: to_concept_id(_rename_concept(label)))

    # Row labels follow the order of `discrete`; label slices are inclusive.
    table.loc[0, 'concept_type'] = 'string'
    table.loc[1, 'concept_type'] = 'time'
    table.loc[2:4, 'concept_type'] = 'entity_domain'
    table.loc[5:, 'concept_type'] = 'measure'
    return table
def create_datapoints(data, indicator_mapping, method_mapping):
    """Write one datapoints csv per concept from the adjusted median estimates.

    data: frame with the columns 'Country or area', 'Indicator',
        'Median estimate and uncertainty intervals', 'DataValue', 'Year'.
    indicator_mapping: indicator name -> mapping with the keys
        'indicator' and 'method'.
    method_mapping: mapping applied to the method values.

    A concept whose mapped method is missing on every row is written by
    country/year; otherwise it is written by country/method/year.
    """
    bound_col = 'Median estimate and uncertainty intervals'
    selected = data[['Country or area', 'Indicator', bound_col, 'DataValue', 'Year']]
    selected = selected[selected[bound_col] == 'MEDIAN ESTIMATE (adjusted)']

    points = selected[['Country or area', 'Year', 'Indicator', 'DataValue']].copy()
    points['concept'] = points['Indicator'].map(
        lambda name: indicator_mapping[name]['indicator'])
    points['method'] = points['Indicator'].map(
        lambda name: indicator_mapping[name]['method'])
    points.columns = ['country', 'year', 'i', 'val', 'concept', 'method']
    points = points[['country', 'year', 'concept', 'method', 'val']]

    for concept_name, group in points.groupby('concept'):
        c_id = to_concept_id(concept_name)
        out = group.copy().drop('concept', axis=1)
        out['country'] = out['country'].map(to_concept_id)
        out['method'] = out['method'].map(method_mapping)
        out.columns = ['country', 'year', 'method', c_id]
        out = out[['country', 'method', 'year', c_id]]

        has_method = not out['method'].dropna().empty
        if has_method:
            out.to_csv(
                '../../ddf--datapoints--{}--by--country--method--year.csv'.
                format(c_id),
                index=False)
        else:
            out = out.drop('method', axis=1)
            out.to_csv(
                '../../ddf--datapoints--{}--by--country--year.csv'.format(
                    c_id),
                index=False)
# entities: one row per value of the 'area' column, keyed by its concept id
area = data001['area'].copy()
area_id = area.map(to_concept_id)
area_df = pd.DataFrame([], columns=['area', 'name'])
area_df['area'] = area_id
area_df['name'] = area
path = os.path.join(out_dir, 'ddf--entities--area.csv')
area_df.to_csv(path, index=False)
# datapoints: reshape the wide table (one row per area) into long
# (area, year, value) rows
# NOTE(review): presumably every column other than 'area' is a year - confirm
dp = data001.set_index('area')
dp = dp.T.unstack()
dp = dp.reset_index()
dp.columns = ['area', 'year', to_concept_id('GDP per capita')]
dp['area'] = dp['area'].map(to_concept_id)
path = os.path.join(out_dir, 'ddf--datapoints--gdp_per_capita--by--area--year.csv')
# rows without a value are not written
dp.dropna().to_csv(path, index=False)
# concepts: the measure plus the discrete concepts used above
conc = ['gdp_per_capita', 'area', 'year', 'name']
cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
cdf['concept'] = conc
cdf['name'] = ['GDP per capita', 'Area', 'Year', 'Name']
cdf['concept_type'] = ['measure', 'entity_domain', 'time', 'string']
path = os.path.join(out_dir, 'ddf--concepts.csv')
cdf.to_csv(path, index=False)
# entities area = data001['Area'].unique() area_id = list(map(to_concept_id, area)) ent = pd.DataFrame([], columns=['area', 'name']) ent['area'] = area_id ent['name'] = area path = os.path.join(out_dir, 'ddf--entities--area.csv') ent.to_csv(path, index=False) # datapoints dps = { 'Total Fertility Rate (TFR), also called Children per Woman': 'total_fertility_rate', 'TFR interpolated': 'total_fertility_rate_interpolated', 'Crude Birth Rate (CBR)': 'crude_birth_rate', 'Princeton If index': to_concept_id('Princeton If index') } for col, col_id in dps.items(): dp = extract_datapoints(data001, col, col_id) path = os.path.join(out_dir, 'ddf--datapoints--{}--by--area--year.csv'.format(col_id)) dp.dropna().sort_values(by=['area', 'year']).to_csv(path, index=False) # data001_dp_1 = data001[['Area', 'Year', 'Total Fertility Rate (TFR), also called Children per Woman']].copy() # data001_dp_2 = data001[['Area', 'Year', 'TFR interpolated']].copy() # data001_dp_1.columns = ['area', 'year', 'total_fertility_rate'] # data001_dp_2.columns = ['area', 'year', 'total_fertility_rate_interpolated'] # data001_dp_1['area'] = data001_dp_1['area'].map(to_concept_id)
def test_to_concept_id(s):
    """Check that a non-empty concept id only contains ``[0-9a-z_]``."""
    from ddf_utils.str import to_concept_id

    res = to_concept_id(s)
    if res:
        # fullmatch checks the whole string. re.match with a `*` pattern
        # succeeds on the empty prefix of any string, so the previous
        # assertion could never fail.
        assert re.fullmatch(r'[0-9a-z_]*', res)
def main():
    """Create the UN SDG region / LDC entity sets and update the country file.

    Downloads the region tree from ``UN_SDG_URL``, writes the
    ``un_sdg_region`` and ``un_sdg_ldc`` entity files and adds the matching
    two columns to ``ddf--entities--geo--country.csv``. Area codes are
    translated to country ids through the geo synonyms file.
    """
    country_df = pd.read_csv('../../ddf--entities--geo--country.csv', dtype=str)
    synonyms_dict = (pd.read_csv('../../ddf--synonyms--geo.csv', dtype=str)
                     .set_index('synonym').geo.to_dict())

    r = requests.get(UN_SDG_URL)
    regions_tree = r.json()
    regions_flat = flatten(regions_tree[0])  # 0 = world

    # Least Developed Countries
    ldc_entities = [
        {
            'un_sdg_ldc': 'un_least_developed',
            'name': regions_tree[1]['geoAreaName'],
            'is--un_sdg_ldc': 'TRUE'
        }, {
            'un_sdg_ldc': 'un_not_least_developed',
            'name': 'Other UN Countries',
            'is--un_sdg_ldc': 'TRUE'
        }]
    ldc_countries = []
    ldc_set = set()
    for country in regions_tree[1]['children']:
        code = str(country['geoAreaCode'])
        if code in synonyms_dict:
            ldc_countries.append({
                'country': synonyms_dict[code],
                'un_sdg_ldc': 'un_least_developed'
            })
            ldc_set.add(code)
        else:
            print('Could not find synonym for ', country)

    # Every other known country is flagged as not least developed.
    for country in regions_flat[1]['children']:
        code = str(country['geoAreaCode'])
        if code in synonyms_dict and code not in ldc_set:
            ldc_countries.append({
                'country': synonyms_dict[code],
                'un_sdg_ldc': 'un_not_least_developed'
            })

    # SDG Regions
    regions = []
    region_countries = []
    for region_id, subregions in region_composition.items():
        region_name = regions_flat[region_id]['geoAreaName']
        region_entity_id = 'un_' + to_concept_id(region_name)
        regions.append({
            'un_sdg_region': region_entity_id,
            'name': region_name,
            'color': '#' + region_color[region_id],
            'is--un_sdg_region': 'TRUE'
        })
        # Visit the sub-regions and then the region itself. A new list is
        # built so the module-level region_composition is left untouched.
        for subregion in [*subregions, region_id]:
            for country in regions_flat[subregion]['children']:
                code = str(country['geoAreaCode'])
                if code in synonyms_dict:
                    region_countries.append({
                        'country': synonyms_dict[code],
                        'un_sdg_region': region_entity_id
                    })
                else:
                    print('Could not find synonym for ', country)

    # un sdg region entity set
    regions_df = pd.DataFrame.from_records(regions)
    regions_df.to_csv('../../ddf--entities--geo--un_sdg_region.csv', index=False)

    # un sdg ldc entity set
    ldc_df = pd.DataFrame.from_records(ldc_entities)
    ldc_df.to_csv('../../ddf--entities--geo--un_sdg_ldc.csv', index=False)

    # update country properties
    country_df = country_df.set_index('country')
    region_countries = pd.DataFrame.from_records(region_countries).set_index('country')
    ldc_countries = pd.DataFrame.from_records(ldc_countries).set_index('country')
    country_df['un_sdg_region'] = region_countries.reindex(country_df.index)['un_sdg_region']
    country_df['un_sdg_ldc'] = ldc_countries.reindex(country_df.index)['un_sdg_ldc']
    country_df.to_csv('../../ddf--entities--geo--country.csv')
def concept_id(str):
    """Return the concept id of ``str`` after passing it through ``underscore``."""
    underscored = underscore(str)
    return to_concept_id(underscored)
def process_file(zf, f, domains, flag_cat, geos):
    """Process a file in zf, create datapoints files and return all concepts.

    zf: the opened outer zip archive.
    f: name of the nested zip inside ``zf``; also the key into ``domains``.
    domains: mapping from file name to domain name, used as a prefix of the
        concept names and as the output sub-directory.
    flag_cat: dtype the flag column is cast to before sorting.
    geos: collection of geo ids to keep.

    Returns a list of dicts with the keys name, concept and unit, one per
    indicator written.
    """
    concs = []

    # Read the nested zip straight from memory. This avoids the race-prone
    # mktemp() file, which was also left behind whenever an error occurred.
    with zipfile.ZipFile(BytesIO(zf.read(f))) as zf2:
        fn_data_csv = guess_data_filename(zf2)
        data_csv = BytesIO(zf2.read(fn_data_csv))
    df = pd.read_csv(data_csv, encoding='latin1', dtype=DEFAULT_DTYPES)

    def starts_with_char(x):
        return re.match('[a-zA-Z].*', x)

    def is_good_length(x):
        return len(x) < 80

    def add_domain(x):
        return ' '.join([domains[f], x])

    def pick_label(item_code, item_name):
        # Use the code when it looks like a readable identifier, otherwise
        # the item name if short enough, otherwise fall back to the code.
        if starts_with_char(item_code) and is_good_length(item_code):
            return item_code
        if is_good_length(item_name):
            return item_name
        return item_code

    if 'Element' in df.columns:
        groups = df.groupby(['Item Code', 'Element Code'])
    else:
        groups = df.groupby('Item Code')

    # The column layout is the same for every group, so detect it once.
    country_col = next(
        (c for c in ('Area Code', 'Country Code', 'CountryCode')
         if c in df.columns),
        None)

    for g, df_g in groups:
        if country_col is None:
            print("Error: column layout not supportted")
            raise KeyError(df.columns)

        df_ = df_g[[country_col, 'Year', 'Value', 'Unit', 'Flag']].copy()
        item_name = df_g.iloc[0]['Item']
        if isinstance(g, tuple):  # grouped by item code and element code
            element_name = df_g.iloc[0]['Element']
            label = pick_label(str(g[0]), item_name)
            concept_fullname = ' - '.join([label, element_name])
            indicator = ' - '.join([item_name, element_name])
        else:  # only grouped by item code
            concept_fullname = pick_label(str(g), item_name)
            indicator = item_name

        concept = to_concept_id(add_domain(concept_fullname))
        df_.columns = ['geo', 'year', concept, 'unit', 'flag']
        df_ = df_.dropna(subset=[concept])
        # don't include geos not in geo domain
        df_ = df_[df_['geo'].isin(geos)].copy()
        if df_.empty:  # no content
            continue
        units = df_['unit'].unique()
        if len(units) > 1:
            print('unit not unique:', concept, units)
            continue  # don't proceed these indicators
        concs.append({'name': indicator, 'concept': concept, 'unit': units[0]})

        # Keep one row per geo/year: after sorting by flag, the first row
        # in flag_cat order wins.
        df_['flag'] = df_['flag'].fillna('_')
        df_['flag'] = df_['flag'].astype(flag_cat)
        df_ = df_.sort_values(by='flag').drop_duplicates(
            subset=['geo', 'year'], keep='first')

        df_ = df_[['geo', 'year', concept]].copy()
        try:
            df_[concept] = df_[concept].map(format_float_digits)
        except decimal.InvalidOperation:
            # Best effort: the values are written unformatted.
            print(f"{concept} values seems not decimals")
        df_['geo'] = df_['geo'].astype(str)

        target_dir = osp.join(out_dir, 'datapoints', domains[f])
        os.makedirs(target_dir, exist_ok=True)
        df_.sort_values(by=['geo', 'year']).to_csv(
            osp.join(target_dir,
                     'ddf--datapoints--{}--by--geo--year.csv'.format(concept)),
            index=False)

    return concs
def main():
    """Convert every csv in ../source into DDF datapoints, entities and concepts.

    Each source file becomes one measure named after the lower-cased file
    name. Key columns beyond the first two become entity domains whose
    entities are collected across all files.
    """
    entities = {}

    for f in os.listdir('../source'):
        if not f.endswith('.csv'):
            continue
        name = f[:-4]
        concept = name.lower()
        print(concept)
        df = read_source(name)
        if df is None:
            print("\tno data")
            continue
        key_columns = get_key_columns(df)
        df = check_source(df, key_columns)

        # rename column names to their ddf concept id
        column_map = {'TimePeriod': 'year', 'GeoAreaCode': 'geo_area'}
        for c in key_columns[2:]:
            column_map[c] = to_concept_id(c)
        column_map['Value'] = concept
        df_ = df[[*key_columns, 'Value']].copy()
        df_.columns = [column_map[c] for c in df_.columns]

        # entities: every column between the first two keys and the value
        for c in df_.columns[2:-1]:
            new_entities = create_entity(c, df_[c].unique())
            if c in entities:
                # pd.concat replaces DataFrame.append, which newer pandas no
                # longer has.
                # NOTE(review): assumes create_entity returns a pandas
                # object - confirm.
                entities[c] = pd.concat([entities[c], new_entities])
            else:
                entities[c] = new_entities
            # convert key columns into concept IDs
            df_[c] = df_[c].map(to_concept_id)

        serve_datapoints(df_, concept)

    # entities
    serve_entities(entities)

    # geo entity, from the api
    gdf = create_geo_entity()
    gdf = sort_df(gdf, 'geo_area')
    gdf.to_csv('../../ddf--entities--geo_area.csv', index=False)

    # concepts
    cdf = create_measure_concepts()
    cdf = sort_df(cdf, 'concept')
    cdf.to_csv('../../ddf--concepts--continuous.csv', index=False)

    cdf2 = pd.DataFrame({'concept': list(entities.keys())})
    cdf2['concept_type'] = 'entity_domain'
    cdf2['name'] = cdf2['concept'].map(lambda x: x.replace('_', ' ').title())
    cdf3 = pd.DataFrame({
        'concept': [
            'geo_area', 'year', 'name', 'description', 'goal', 'indicator',
            'target'
        ],
        'concept_type': [
            'entity_domain', 'time', 'string', 'string', 'string', 'string',
            'string'
        ],
        'name': [
            'Geo Area', 'Year', 'Name', 'Description', 'Goal', 'Indicator',
            'Target'
        ]
    })
    cdf_ = pd.concat([cdf2, cdf3], ignore_index=True)
    cdf_ = sort_df(cdf_, 'concept')
    cdf_.to_csv('../../ddf--concepts--discrete.csv', index=False)
def update_enjson(enj, ddf_concept, graphs):
    """Update the existing en.json with new concepts.

    enj: source en.json as a dict; it is modified in place and returned.
    ddf_concept: ddf--concepts of this repo.
    graphs: graph settings file to get the menu levels indicator.
    """
    trs = ddf_concept[['concept', 'name', 'unit', 'description']]
    # remove the country groups and dont-panic-poverty concepts, which
    # will be processed differently.
    trs = trs.iloc[6:-3].copy()

    # For each concept add "<prefix>/<concept>" entries for its name, unit
    # and description. An existing entry is only overwritten when it is
    # empty.
    for prefix, column in (('indicator', 'name'),
                           ('unit', 'unit'),
                           ('description', 'description')):
        texts = trs.set_index(prefix + '/' + trs['concept'])[column].fillna("")
        for key, text in texts.to_dict().items():
            if key not in enj or len(enj[key]) == 0:
                enj[key] = text

    # menu levels.
    levels = graphs[['concept', 'menu_level1', 'menu_level_2']]
    for column in ('menu_level1', 'menu_level_2'):
        # dropna() skips missing levels reliably; an identity test against
        # np.nan only matches that exact object.
        for label in levels[column].dropna().unique():
            enj['indicator/' + to_concept_id(label)] = label

    # country groupings
    c5 = ddf_concept[ddf_concept['concept_type'] == 'entity_set'].copy()
    for _, row in c5.iterrows():
        enj['indicator/' + 'geo.' + row['concept']] = row['name']

    return enj
def main():
    """Convert the source JSON into DDF datapoints, entities and concepts.

    Reads the file at ``source``, flattens every series/observation pair into
    one record, and writes the datapoints file, one entity file per series
    dimension and ``ddf--concepts.csv`` into ``out_dir``.
    """
    # use a context manager so the file handle is closed deterministically
    with open(source) as f:
        data = json.load(f)

    # create datapoints
    dimensions = data['structure']['dimensions']
    series_dims = dimensions['series']
    series = data['dataSets'][0]['series']
    index_cols = [x['name'] for x in series_dims]
    indicator_name = data['structure']['name']
    # TODO: might be the observations index is a list?
    year_values = dimensions['observation'][0]['values']

    recs = []
    for series_key, series_data in series.items():
        # the series key is a colon separated list of positions, one per
        # series dimension; translate each position into the value id.
        idxs = list(map(int, series_key.split(':')))
        idx_dict = dict(zip(index_cols, idxs))
        for pos, k in enumerate(idx_dict.keys()):
            if pos != series_dims[pos]['keyPosition']:
                raise ValueError(
                    'dimension {!r} is not at key position {}'.format(k, pos))
            idx_dict[k] = series_dims[pos]['values'][idx_dict[k]]['id']

        for t, o in series_data['observations'].items():
            rec = dict(idx_dict)
            rec['year'] = int(year_values[int(t)]['id'])
            rec[indicator_name] = o[0]
            recs.append(rec)

    rdf = pd.DataFrame.from_records(recs)
    rdf.columns = rdf.columns.map(to_concept_id)
    # amount_type is in uppercase, fix it
    rdf.amount_type = rdf.amount_type.map(to_concept_id)
    # rename the indicator
    rdf = rdf.rename(columns={'aid_oda_by_sector_and_donor_dac5': 'oda'})
    (rdf.set_index(['aid_type', 'amount_type', 'donor', 'sector', 'year'])
        .to_csv(Path(out_dir, 'ddf--datapoints--oda--by--aid_type--amount_type--donor--sector--year.csv')))

    # create entities
    entities_concepts = []
    entities_dfs = {}
    for d in series_dims:
        dim_name = d['name']
        concept = to_concept_id(dim_name)
        value_df = pd.DataFrame.from_records(d['values'])
        # presumably each value record holds exactly an id and a name, in
        # that order -- verify against the source file
        value_df.columns = [concept, 'name']
        entities_concepts.append({'concept': concept,
                                  'name': dim_name,
                                  'concept_type': 'entity_domain'})
        entities_dfs[concept] = value_df

    for k, v in entities_dfs.items():
        path = Path(out_dir, f'ddf--entities--{k}.csv')
        v[k] = v[k].map(to_concept_id)
        v.to_csv(path, index=False)

    # create concepts
    # manually insert some concepts
    concepts = [
        {'concept': 'oda', 'concept_type': 'measure', 'name': 'Aid (ODA) by sector and donor'},
        {'concept': 'year', 'concept_type': 'time', 'name': 'Year'},
        {'concept': 'name', 'concept_type': 'string', 'name': 'Name'}
    ]
    cdf = pd.DataFrame.from_records([*concepts, *entities_concepts])
    cdf.to_csv(Path(out_dir, 'ddf--concepts.csv'), index=False)

    print('Done.')
def csvs_to_ddf(files, out_path):
    """convert raw files to ddfcsv

    Each input file must be named ``indicators--by--<key>--<key>....csv``;
    the keys are taken from the file name. The last key is treated as a time
    column when its values parse as years, every other key as an entity
    domain, and every remaining column as a measure.

    Args
    ----
    files: list
        a list of file paths to build ddf csv
    out_path: `str`
        the directory to put the ddf dataset

    Raises
    ------
    ValueError
        if a file name does not follow the expected pattern.
    """
    import re
    from os.path import join
    from ddf_utils.str import to_concept_id

    concepts_df = pd.DataFrame([['name', 'Name', 'string']],
                               columns=['concept', 'name', 'concept_type'])
    concepts_df = concepts_df.set_index('concept')

    all_entities = dict()

    # the dot before the extension is escaped so it only matches a literal '.'
    pattern = re.compile(r'indicators--by--([ 0-9a-zA-Z_-]*)\.csv')

    for f in files:
        data = pd.read_csv(f)
        basename = os.path.basename(f)
        matched = pattern.match(basename)
        if matched is None:
            raise ValueError(
                'unexpected file name (want indicators--by--<keys>.csv): '
                '{}'.format(basename))
        keys = matched.group(1).split('--')
        keys_alphanum = list(map(to_concept_id, keys))

        # check if there is a time column. Assume last column is time.
        # OutOfBoundsDatetime is a ValueError subclass, so one clause covers
        # both failure modes (pd.tslib no longer exists).
        try:
            pd.to_datetime(data[keys[-1]], format='%Y')
        except ValueError:
            has_time = False
        else:
            has_time = True

        ent_keys = keys[:-1] if has_time else keys

        # set concept type
        for col in data.columns:
            concept = to_concept_id(col)
            if col in ent_keys:
                t = 'entity_domain'
            elif col in keys:
                t = 'time'
            else:
                t = 'measure'
            concepts_df.loc[concept] = [col, t]

        # collect the entities of every entity key
        for ent in ent_keys:
            ent_df = data[[ent]].drop_duplicates().copy()
            ent_concept = to_concept_id(ent)
            ent_df.columns = ['name']
            ent_df[ent_concept] = ent_df.name.map(to_concept_id)

            if ent_concept not in all_entities:
                all_entities[ent_concept] = ent_df
            else:
                all_entities[ent_concept] = pd.concat(
                    [all_entities[ent_concept], ent_df], ignore_index=True)

        data = data.set_index(keys)
        for c in data:
            # output datapoints
            df = data[c].copy()
            df = df.reset_index()
            # map every entity key (not just keys[:-1]) so the datapoints
            # agree with the entity files when there is no time column.
            for k in ent_keys:
                df[k] = df[k].map(to_concept_id)
            df.columns = df.columns.map(to_concept_id)
            (df.dropna()
               .to_csv(join(out_path,
                            'ddf--datapoints--{}--by--{}.csv'.format(
                                to_concept_id(c), '--'.join(keys_alphanum))),
                       index=False))

    # output concepts
    concepts_df.to_csv(join(out_path, 'ddf--concepts.csv'))

    # output entities; the same entity can come from several input files,
    # so duplicates are dropped before writing.
    for c, df in all_entities.items():
        (df.drop_duplicates()
           .to_csv(join(out_path, 'ddf--entities--{}.csv'.format(c)),
                   index=False))

    dp = get_datapackage(out_path, use_existing=False)
    dump_json(os.path.join(out_path, 'datapackage.json'), dp)

    return
def conceptID(str):
    """Return the concept id of ``str`` after running it through ``camelToSnake``."""
    snake_cased = camelToSnake(str)
    return to_concept_id(snake_cased)
# concepts: four fixed concepts followed by one measure per distinct Element
concs = data['Element'].unique()
cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
cdf['name'] = ['Name', 'Country', 'Item', 'Year', *concs]
cdf['concept'] = cdf['name'].map(to_concept_id)
cdf.concept_type = 'measure'
# the first four rows are the fixed, non-measure concepts
cdf.loc[0, 'concept_type'] = 'string'
cdf.loc[1, 'concept_type'] = 'entity_domain'
cdf.loc[2, 'concept_type'] = 'entity_domain'
cdf.loc[3, 'concept_type'] = 'time'
cdf.to_csv(os.path.join(out_path, 'ddf--concepts.csv'), index=False)

# datapoints: one file per Element
data_ = data[['Country Code', 'Item Code', 'Element', 'Year Code', 'Value']]
# iterate the groups directly; the removed `.ix` indexer is no longer needed
for element, group in data_.groupby('Element'):
    cid = to_concept_id(element)
    df = group.drop('Element', axis=1)
    df.columns = ['country', 'item', 'year', cid]
    # format only the file name, so braces in out_path cannot break it
    path = os.path.join(
        out_path,
        'ddf--datapoints--{}--by--country--item--year.csv'.format(cid))
    df.to_csv(path, index=False)

get_datapackage(out_path, use_existing=True, to_disk=True)
print('Done.')
# entities: one row per distinct area, keyed by its concept id
area = data001['Area'].unique()
area_id = [to_concept_id(a) for a in area]
ent = pd.DataFrame([], columns=['area', 'name'])
ent['name'] = area
ent['area'] = area_id
path = os.path.join(out_dir, 'ddf--entities--area.csv')
ent.to_csv(path, index=False)

# datapoints: one file per source column, named after its concept id
dps_list = [
    'Life expectancy at birth',
    'Life expectancy, with interpolations'
]
dps = {x: to_concept_id(x) for x in dps_list}
for col, col_id in dps.items():
    dp = extract_datapoints(data001, col, col_id)
    fname = 'ddf--datapoints--{}--by--area--year.csv'.format(col_id)
    path = os.path.join(out_dir, fname)
    dp = dp.dropna().sort_values(by=['area', 'year'])
    dp.to_csv(path, index=False)
# read codebook csv cb = pd.read_csv(cb_csv, skiprows=1) cb = cb.drop('Variable:', axis=1) # unneeded column # read data csv data = pd.read_csv(data_csv) # now begins the etl process names = list() measures = list() data['metric'] = data['metric'].map(to_concept_id) # each metric will act as a measure, and there are 3 parts of data # for each measure for met in cb.metric.drop(0).dropna().unique(): met_id = to_concept_id(met) df = data.groupby(by='metric').get_group(met_id) for i in ['mean', 'lower', 'upper']: # append the measure and measure name lists. measure = '{}_{}'.format(met_id, i) measures.append(measure) if i in ['lower', 'upper']: name = '{}, 95% Uncertainty Interval - {} Bound'.format( met, i.title()) else: name = '{}: Mean'.format(met) names.append(name) # save datapoints df = df.rename(columns={i: measure}) df_out = df[[
# -*- coding: utf-8 -*-
import pandas as pd
import os
from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file

# source workbook and output directory, relative to this script
source = '../source/gapdata009.xlsx'
out_dir = '../../'

# the datapoint name and datapoint id
dp_name_sheet = 'data & sources'
dp_name = 'Average age at 1st marriage (girls)'
dp_id = to_concept_id(dp_name)

if __name__ == '__main__':
    # reading data
    # `sheetname` was removed from read_excel; `sheet_name` is the
    # supported spelling.
    data001 = pd.read_excel(source, sheet_name=dp_name_sheet)

    data001_dp = data001[['Country', 'Year', 'Data']].copy()
    data001_dp.columns = ['country', 'year', dp_id]

    # entities: one row per distinct country, keyed by its concept id
    country = data001_dp['country'].unique()
    country_id = list(map(to_concept_id, country))
    ent = pd.DataFrame([], columns=['country', 'name'])
    ent['country'] = country_id
    ent['name'] = country
    path = os.path.join(out_dir, 'ddf--entities--country.csv')
def generate_metadata(ddf_concept, graphs, meta2, area, outdir, oneset=False):
    """Generate the metadata.json.

    ddf_concept: ddf--concepts for this repo.
    graphs: graph settings
    meta2: the old metadata.json (not used by this function)
    area: area_categorizarion.json
    outdir: the output dir of datapoints.
    oneset: if oneset is true, only one entity set(world_4region) will be added.

    Returns an OrderedDict with two keys: ``indicatorsDB`` (per-indicator
    settings plus domain/availability statistics read from the datapoints
    files in ``outdir``) and ``indicatorsTree`` (the menu tree).
    """
    def _parse_json(x):
        # json fields are stored as strings; anything else (e.g. NaN) is
        # passed through untouched.
        return json.loads(x) if isinstance(x, str) else x

    # use OrderedDict in order to keep the order of insertion.
    indb = OrderedDict([['indicatorsDB', OrderedDict()]])
    db = indb['indicatorsDB']

    # rename indicator_url to sourceLink
    ddf_concept = ddf_concept.rename(columns={'indicator_url': 'sourceLink'})
    # convert json fields to dict/list object.
    ddf_concept['scales'] = ddf_concept['scales'].map(_parse_json)
    ddf_concept['color'] = ddf_concept['color'].map(_parse_json)

    # geo property
    geo_list = ['geo', 'name', 'latitude', 'longitude', 'world_4region']
    geo_cols = ['scales', 'sourceLink', 'color']
    ddf_concept = ddf_concept.set_index('concept')
    for k in geo_list:
        values = ddf_concept.loc[[k], geo_cols]
        values.columns = ['scales', 'sourceLink', 'color']
        value_dict = to_dict_dropna(values)
        key = k if k == 'geo' else 'geo.' + k

        db[key] = value_dict[k]
        db[key]['use'] = 'property'
        # (the original re-assigned 'color' to itself here; that no-op has
        # been dropped)

    # manually add a _default and time indicator
    db['time'] = {
        "use": "indicator",
        "scales": ["time"],
        "sourceLink": ""
    }
    db['_default'] = {
        "use": "constant",
        "scales": ["ordinal"],
        "sourceLink": ""
    }

    if not oneset:
        group_data = ddf_concept[ddf_concept['domain'] == 'geo'][geo_cols]
        group_names = group_data.index
        group_names = group_names.drop(['country', 'world_4region'])

        for g in sorted(group_names):
            # .ix was removed in pandas 1.0; the index holds concept labels,
            # so .loc is the matching indexer.
            value_dict = to_dict_dropna(group_data.loc[[g]])
            key = 'geo.' + g
            db[key] = value_dict[g]
            db[key]['use'] = 'property'

    ddf_concept = ddf_concept.reset_index()

    # all measure types.
    measure_cols = ['concept', 'sourceLink', 'scales', 'interpolation', 'color']
    mdata = ddf_concept[ddf_concept['concept_type'] == 'measure'][measure_cols]
    mdata = mdata.set_index('concept')
    mdata = mdata.drop(['longitude', 'latitude'])
    mdata.columns = ['sourceLink', 'scales', 'interpolation', 'color']
    mdata['use'] = 'indicator'
    mdata_dict = to_dict_dropna(mdata)
    for k in sorted(mdata_dict.keys()):
        db[k] = mdata_dict.get(k)

    # add domain/availability statistics for every entry that has a
    # datapoints file; entries without one are reported and left as is.
    for i in db.keys():
        fname = os.path.join(outdir, 'ddf--datapoints--'+i+'--by--geo--time.csv')
        try:
            df = pd.read_csv(fname, dtype={i: float, 'time': int})
        except (OSError, IOError):
            print('no datapoints for ', i)
            continue
        if df.empty:
            # min()/max() of an empty column is NaN and int(NaN) raises;
            # treat a header-only file like a missing one.
            print('no datapoints for ', i)
            continue

        # domain and availability
        dm = [float(df[i].min()), float(df[i].max())]
        av = [int(df['time'].min()), int(df['time'].max())]

        # make it zero when the number is too small
        if np.abs(dm[0]) < 1e-5:
            dm[0] = 0
        if np.abs(av[0]) < 1e-5:
            av[0] = 0

        # domain_quantiles_10_90:
        # 1) sort by indicator value
        # 2) remove top and bottom 10% of values (os if 100 points, remove 10 from top and bottom)
        # 3) take first and last value of what's left as min and max in the property above.
        values_sorted = df[i].sort_values().values
        q_10 = int(np.round(len(values_sorted) / 10))
        q_90 = -1 * q_10 - 1
        domain_quantiles_10_90 = [values_sorted[q_10], values_sorted[q_90]]

        db[i].update({
            'domain': dm,
            'availability': av,
            'domain_quantiles_10_90': domain_quantiles_10_90
        })

    # indicator Trees
    indb['indicatorsTree'] = OrderedDict([['id', '_root'], ['children', []]])
    root_children = indb['indicatorsTree']['children']
    ti = OrderedDict([['id', 'time']])
    pro = OrderedDict([['id', '_properties'],
                       ['children', [{'id': 'geo'}, {'id': 'geo.name'}]]])

    root_children.append(ti)

    all_levels = (graphs[['concept', 'menu_level1', 'menu_level_2']]
                  .sort_values(['menu_level1', 'menu_level_2'],
                               na_position='first'))

    # change nans to something more convenient
    all_levels['menu_level1'] = all_levels['menu_level1'].apply(to_concept_id).fillna('0')
    all_levels['menu_level_2'] = all_levels['menu_level_2'].apply(to_concept_id).fillna('1')

    all_levels = all_levels.set_index('concept')

    g = all_levels.groupby('menu_level1').groups

    ks = list(sorted(g.keys()))

    # move 'for_advanced_user' to the end of list.
    if 'for_advanced_users' in ks:
        ks.remove('for_advanced_users')
        ks.append('for_advanced_users')

    # loop though all level1 keys, for each key:
    # if key is nan, insert to the _root tree with {'id': concept_name}
    # else, insert {'id': concept_name, 'children': []}
    # then group all concepts with the key as level 1 menu by level 2 menu
    # loop though each level 2 group and do the same insertion logic as above.
    for key in ks:
        if key == '0':  # so it's NaN
            for i in sorted(g[key]):
                root_children.append({'id': i})

            # insert _properities entity after the root level menus as requested.
            root_children.append(pro)
            if not oneset:
                for i in range(len(area)):
                    group_key = 'geo.'+to_concept_id(area[i]['n'])
                    # remove geo.geographic_regions_in_4_colors as requested
                    # by Jasper
                    if group_key == 'geo.geographic_regions_in_4_colors':
                        continue
                    pro['children'].append({'id': group_key})

            pro['children'].append({'id': 'geo.world_4region'})
            continue

        level1_node = OrderedDict([['id', key], ['children', []]])
        root_children.append(level1_node)

        # g[key] holds the concept labels of this level 1 menu
        g2 = all_levels.loc[g[key]].groupby('menu_level_2').groups
        for key2 in sorted(g2.keys()):
            if key2 == '1':  # it's NaN
                for i in sorted(g2[key2]):
                    level1_node['children'].append({'id': i})
            else:
                level2_node = OrderedDict([['id', key2], ['children', []]])
                level1_node['children'].append(level2_node)
                for i in sorted(g2[key2]):
                    level2_node['children'].append({'id': i})

    return indb
# How to read each yearly CPI workbook. A file is matched by the year marker
# appearing in its name. Each entry is:
#   (year marker, read_excel options, source columns, index labels to drop)
# Source columns are (country, [wbcode,] score); in the 2013/2014 workbooks
# the wbcode column is the unnamed third column.
_CPI_SOURCES = (
    ('2014', {'skiprows': 1, 'sheet_name': 'CPI 2014'},
     ['Country / Territory', 'Unnamed: 2', 'CPI 2014 Score'], ()),
    ('2015', {'sheet_name': 'CPI 2015'},
     ['Country', 'wbcode', 'CPI2015'], ()),
    ('2010', {'skiprows': 1, 'sheet_name': 'CPI table'},
     ['Country / Territory', 'CPI 2010 Score'], (0, 1)),
    ('2011', {'sheet_name': 'Global'},
     ['Country / Territory', 'CPI 2011 Score'], ()),
    ('2012', {'sheet_name': 'CPI 2012'},
     ['Country / Territory', 'CPI 2012 Score'], ()),
    ('2013', {'sheet_name': 'CPI 2013'},
     ['Country / Territory', 'Unnamed: 2', 'CPI 2013 Score'], ()),
)


def _read_cpi_sheet(path, year, read_options, columns, drop_rows):
    """Read one yearly CPI sheet into a frame with normalized column names."""
    data = pd.read_excel(path, **read_options)
    # copy so that adding the year column does not write into a slice
    data = data[columns].copy()
    data['year'] = int(year)
    if drop_rows:
        data = data.drop(list(drop_rows))
    data = data.dropna()
    if len(columns) == 3:
        data.columns = ['country', 'wbcode', 'cpi', 'year']
    else:
        data.columns = ['country', 'cpi', 'year']
    return data


def cleanup_data(source_dir):
    """Load all yearly CPI excel files in ``source_dir`` into one DataFrame.

    Every file whose name contains ``xls`` and one of the known years
    (2010-2015) is read with that year's sheet layout. The result has the
    columns ``country``, ``wbcode``, ``cpi`` and ``year``; for each country
    ``wbcode`` is set to the first code found in any year, or to
    ``to_concept_id(country)`` when no year provides one.

    Raises ValueError when no file in ``source_dir`` matches.
    """
    all_data = []
    for f in os.listdir(source_dir):
        if 'xls' not in f:
            continue
        path = os.path.join(source_dir, f)
        for year, read_options, columns, drop_rows in _CPI_SOURCES:
            if year in f:
                all_data.append(
                    _read_cpi_sheet(path, year, read_options, columns,
                                    drop_rows))

    if not all_data:
        raise ValueError(
            'no CPI excel files found in {!r}'.format(source_dir))

    # concat all data and fill in wbcode column.
    all_data_df = pd.concat(all_data)
    all_data_df = all_data_df.reset_index(drop=True)
    all_data_df.country = all_data_df.country.str.strip()
    if 'wbcode' not in all_data_df.columns:
        # only workbooks without a code column were found (2010-2012)
        all_data_df['wbcode'] = None

    gps = all_data_df.groupby('country')
    for k, v in gps.groups.items():
        # v holds the row labels of country k
        codes = all_data_df.loc[v, 'wbcode'].unique()
        # pd.notnull, unlike `is not np.nan`, also recognises None and
        # NaN objects other than the np.nan singleton
        wbcode_list = [x for x in codes if pd.notnull(x)]
        if wbcode_list:
            country = wbcode_list[0]
        else:
            country = to_concept_id(k)
        all_data_df.loc[v, 'wbcode'] = country

    return all_data_df