def extract_entities_groups(regs, gps):
    """extract all country groups entities

    regs: regions.json, contains country/region name and gwid.
    gps: area_categorizarion.json, contains groups name and group levels

    returns a dictionary which keys are group name and values are dataframes.
    """
    res = {}

    regd = {}  # maps region gwid -> region name
    for i in regs:
        # each entry of regions.json is a one-item dict: {region_name: gwid}
        name = list(i.keys())[0]
        regd[i[name]] = name

    for i, n in gps.n.apply(to_concept_id).items():
        df = pd.DataFrame([], columns=[n, 'name', 'gwid', 'is--'+n])
        df['gwid'] = list(gps.iloc[i]['groupings'].keys())
        if i == 4:  # this group's ids are built without a separator
            df[n] = df['gwid'].apply(lambda x: to_concept_id(regd[x], sep=''))
        else:
            df[n] = df['gwid'].apply(lambda x: to_concept_id(regd[x]))
        df['name'] = df['gwid'].apply(lambda x: regd[x])
        df['is--'+n] = 'TRUE'
        res[n] = df
    return res
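
# Usage sketch with hypothetical miniature inputs mirroring the shapes the
# docstring describes: each regions.json entry is a one-item dict
# {region_name: gwid}, and each gps row has a group name `n` plus a
# `groupings` dict keyed by group gwid.
sample_regs = [{'Asia': 'g1'}, {'Europe': 'g2'}]
sample_gps = pd.DataFrame({'n': ['World regions'],
                           'groupings': [{'g1': ['c1'], 'g2': ['c2']}]})
print(extract_entities_groups(sample_regs, sample_gps)['world_regions'])
# columns: world_regions, name, gwid, is--world_regions
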
def map_to_id(x):
    if x == "Stockholms län":  # prefix the county code 01 before normalizing
        return to_concept_id("01 " + x)
    elif x == "Riket":  # "Riket" = the country as a whole; map to Sweden's id
        return to_concept_id("swe")
    else:
        return to_concept_id(x)
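
# Quick check of the special cases above (assuming ddf_utils' to_concept_id):
# "Riket" maps to the iso3 'swe', and Stockholm gets its county code '01 '
# prefixed before normalization.
assert map_to_id('Riket') == 'swe'
assert map_to_id('Stockholms län') == to_concept_id('01 Stockholms län')
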
Example #3
def extract_datapoints(data):
    conc = 'Annual number of hours actually worked per person'

    dps = data[['Country (code)', 'Year', conc]].copy()
    dps.columns = ['country', 'year', to_concept_id(conc)]
    dps['country'] = dps['country'].map(to_concept_id)

    return to_concept_id(conc), dps.dropna()
Example #4
def _read_data(data_file):
    with open(data_file) as f:
        data = etree.parse(f)

    root = data.getroot()

    # get all namespaces from the xml
    nsmap = root.nsmap.copy()
    # change None to a meaningful name, so that we can use it later.
    nsmap['xmlns'] = nsmap.pop(None)

    all_data = {}

    all_ser = root.xpath('.//xmlns:Series', namespaces=nsmap)

    for item in all_ser:
        # each series tag contains a time series for a given indicator
        # and country. here we loop through all series and group them into the
        # all_data dict, where keys are indicators and values are a dict of
        # {location: series} for that indicator.
        item_dict = xmltodict.parse(etree.tostring(item))

        attrs = {}
        ser = []

        # getting series attributes: location and indicator id
        for i in item_dict['Series']['SeriesKey']['Value']:
            if i['@concept'] == 'EDULIT_IND':
                attrs['key'] = to_concept_id(i['@value'])
            if i['@concept'] == 'LOCATION':
                attrs['location'] = to_concept_id(i['@value'])

        # get observation data.
        obs = item_dict['Series']['Obs']
        if isinstance(obs, list):
            for o in item_dict['Series']['Obs']:
                ser.append([o['Time'], o['ObsValue']['@value']])
        else:
            ser.append([obs['Time'], obs['ObsValue']['@value']])

        if attrs['key'] not in all_data.keys():
            all_data[attrs['key']] = {attrs['location']: ser}
        else:
            # there should be no duplicates in locations
            assert attrs['location'] not in all_data[attrs['key']].keys()
            all_data[attrs['key']][attrs['location']] = ser

    # concat the list of series for each indicator
    for k, v in all_data.items():
        to_concat = []
        for loc, ser in v.items():
            df = pd.DataFrame(ser, columns=['time', k])
            df['location'] = loc
            to_concat.append(df)

        all_data[k] = pd.concat(to_concat, ignore_index=True)

    return all_data
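
# Standalone illustration (assumed minimal SDMX fragment) of the dict shape
# xmltodict produces for one <Series> element, which the loop above relies on.
import xmltodict

_fragment = """
<Series>
  <SeriesKey>
    <Value concept="EDULIT_IND" value="ROFST_1"/>
    <Value concept="LOCATION" value="FRA"/>
  </SeriesKey>
  <Obs><Time>2010</Time><ObsValue value="1.5"/></Obs>
</Series>
"""
_d = xmltodict.parse(_fragment)
assert _d['Series']['SeriesKey']['Value'][1]['@value'] == 'FRA'
# a single <Obs> parses to a dict, while several parse to a list -- hence
# the isinstance(obs, list) check in _read_data above
assert not isinstance(_d['Series']['Obs'], list)
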
Example #5
def extract_datapoints(data):
    for k, df in data.items():
        df.columns = list(map(to_concept_id, df.columns))
        df['country'] = df['country'].map(to_concept_id)

        df = df.set_index(['country', 'year'])

        df = df.stack().reset_index()
        df['year'] = df['year'].map(int)

        df.columns = ['country', 'year', 'sex', to_concept_id(k)]

        yield to_concept_id(k), df
Example #7
def serve_datapoints_return_measures(data_full: pd.DataFrame, measure: dict,
                                     metric: dict):
    all_measures = []

    groups = data_full.groupby(by=['measure', 'metric'])

    for g in groups.groups:
        name = measure[g[0]] + ' ' + metric[g[1]]
        # print(name)
        concept = to_concept_id(name)
        all_measures.append((concept, name))

        df = groups.get_group(g)
        df = df.rename(columns={'val': concept})
        cause_groups = df.groupby(by='cause')  # split by cause
        cols = ['location', 'sex', 'age', 'cause', 'year', concept]
        df[concept] = df[concept].map(formatter)
        # if concept == 'mmr_rate':
        #     print(df.sex.unique())
        for g_ in cause_groups.groups:
            df_ = cause_groups.get_group(g_)
            # print(g_)
            # print(df_.age.unique())
            # print(len(df_.year.unique()))
            # print(len(df_.location.unique()))
            cause = 'cause-{}'.format(g_)
            by = ['location', 'sex', 'age', cause, 'year']
            file_name = 'ddf--datapoints--' + concept + '--by--' + '--'.join(
                by) + '.csv'
            file_path = osp.join(output_dir, file_name)
            df_[cols].sort_values(
                by=['location', 'sex', 'age', 'year']).to_csv(file_path,
                                                              index=False)
    return all_measures
Example #8
def extract_datapoints(data):

    conc = [
        "Labour force ('000)", "Population ('000)",
        "Labour force participation rate (%)"
    ]

    dps = data[[
        'Country (code)', 'Sex (code)', 'Age group (code)', 'Year', *conc
    ]].copy()
    dps.columns = [
        'country', 'sex', 'age_group', 'year',
        *[to_concept_id(_rename_concept(x)) for x in conc]
    ]

    dps['country'] = dps['country'].map(to_concept_id)
    dps['sex'] = dps['sex'].map(to_concept_id)

    dps['age_group'] = dps['age_group'].str.replace(
        '+', '_plus', regex=False).map(to_concept_id)

    dps = dps.set_index(['country', 'sex', 'age_group', 'year'])

    for k, df in dps.items():
        df_ = df.reset_index().dropna()

        yield k, df_
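
# Sketch (an assumption, not from the original repo) of how the generator
# above would be served to ddf datapoint files; the file-name pattern follows
# the other examples in this listing.
def write_datapoints_sketch(data):
    for k, df_ in extract_datapoints(data):
        fname = f'ddf--datapoints--{k}--by--country--sex--age_group--year.csv'
        df_.to_csv(fname, index=False)
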
Example #9
def main():
    data = pd.read_excel('../source/injury_mortality_trend_tables.xls',
                         sheet_name="Rates",
                         skiprows=4)

    # manually set the cause entity domain name->concept mapping
    m = {
        'Road traffic accidents': 'traffic',
        'Homicide': 'homicide',
        'Self-inflicted injuries': 'suicide'
    }

    data = data.rename(columns={'cause of death': 'cause'})
    data['cause'] = data['cause'].map(m)

    # datapoints
    dps = data.copy()
    dps = dps.drop(['ICD', 'name'], axis=1)
    dps = dps.set_index(['country', 'year', 'cause', 'sex'])
    cols = dps.columns
    for c in dps:
        c_ = to_concept_id(c)
        df = dps[c].copy()
        df.name = c_
        df.reset_index().dropna().to_csv(
            f"../../ddf--datapoints--{c_}--by--country--year--cause--sex.csv",
            index=False)

    # country
    country = data[['country', 'name']].drop_duplicates(subset=['country'],
                                                        keep='first')
    country.to_csv('../../ddf--entities--country.csv', index=False)

    # sex
    sex = pd.DataFrame([[0, 'Both sexes'], [1, 'male'], [2, 'female']],
                       columns=['sex', 'name'])
    sex.to_csv('../../ddf--entities--sex.csv', index=False)

    # cause
    cause = pd.DataFrame.from_dict(m, orient='index').reset_index()
    cause.columns = ['name', 'cause']
    cause.to_csv('../../ddf--entities--cause.csv', index=False)

    # concepts
    cont = cols.map(to_concept_id)
    cont_df = pd.DataFrame.from_dict({'concept': cont, 'name': cols})
    cont_df['concept_type'] = 'measure'

    ent_df = pd.DataFrame(
        [['country', 'Country'], ['sex', 'Sex'], ['cause', 'Cause']],
        columns=['concept', 'name'])
    ent_df['concept_type'] = 'entity_domain'

    other_df = pd.DataFrame(
        [['year', 'Year', 'time'], ['name', 'Name', 'string']],
        columns=['concept', 'name', 'concept_type'])

    concepts_df = pd.concat([cont_df, ent_df, other_df], sort=False)
    concepts_df.to_csv('../../ddf--concepts.csv', index=False)
def extract_entities_country(regs, geo, gps, geo_sg, geo_map=False):
    """if geo_map is True, return a geomap which maps the old country id to new id
    else return the country entities with new id.

    regs: regions.json, contains country/region name and gwid.
    geo: country_synonyms.xlsx, contains all country info
    gps: area_categorizarion.json, contains groups name and group levels
    geo_sg: country entities from systema_globalis
    """
    regd = {}  # maps region gwid -> region name
    for i in regs:
        name = list(i.keys())[0]
        regd[i[name]] = name

    geo_ = geo[['ISO3dig_ext', 'Gwid']]
    geo_ = geo_.set_index('Gwid')
    geo_2 = geo.set_index('Gwid').drop('ISO3dig_ext', axis=1)

    country = geo_.copy()

    # loop through all groupings, building a dataframe whose index is gwid
    # and whose columns are group names.
    for i, n in gps.n.apply(to_concept_id).items():
        res = {}

        for k, v in gps.iloc[i]['groupings'].items():
            for gwid in v:
                if gwid:
                    res[gwid] = to_concept_id(regd[k])

        ser = pd.Series(res)

        country[n] = ser

    # combine the groupings info and other info, and do some cleanups.
    country2 = pd.concat([country, geo_2], axis=1)
    country2 = country2.reset_index()
    country2 = country2.rename(columns={'NAME': 'Upper Case Name', 'Use Name': 'Name', 'ISO3dig_ext': 'country_2'})
    country2.columns = list(map(to_concept_id, country2.columns))
    country2['is--country'] = 'TRUE'

    # adding world_4region data
    country3 = geo_sg[['geo', 'world_4region', 'latitude', 'longitude', 'name']]
    country3 = country3.rename(columns={'geo': 'country'}).set_index('name')

    # the final dataframe
    country4 = pd.concat([country2.set_index('name'), country3], axis=1)
    country4 = country4.reset_index()
    country4 = country4.rename(columns={'index': 'name'})

    if not geo_map:
        country4 = country4.drop('country_2', axis=1)
        cols = country4.columns.drop(['country', 'gwid', 'name'])
        ex_col = np.r_[['country', 'gwid', 'name'], cols]
        return country4.loc[:, ex_col]
    else:
        country4 = country4.set_index('country_2')
        return country4['country']
def concept_id(obj, renames={}, dict_value=True, dict_key=True):
    if isinstance(obj, list):
        return [concept_id(x, renames, dict_value, dict_key) for x in obj]
    if isinstance(obj, dict):
        return {
            (concept_id(k, renames, dict_value, dict_key) if dict_key else k):
            (concept_id(v, renames, dict_value, dict_key) if dict_value else v)
            for k, v in obj.items()
        }
    return to_concept_id(underscore(rename(obj, renames)))
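
# Hedged usage sketch, assuming `rename` falls back to its input when it has
# no entry in `renames` and `underscore` is inflection-style: keys get
# normalized while values can be kept verbatim.
print(concept_id({'Country Name': 'Sweden'}, dict_value=False))
# expected: {'country_name': 'Sweden'}
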
Example #12
def extract_datapoints(all_data):
    for df in all_data:
        for g, ids in df.groupby('Variable Name').groups.items():
            df_concept = df.loc[ids].copy()
            concept = to_concept_id(g)

            df_concept['area'] = df['Area'].map(to_concept_id)
            df_concept = df_concept.rename(columns={'Value': concept, 'Year': 'year'})
            df_yield = df_concept[['area', 'year', concept]].copy()

            yield concept, df_yield.drop_duplicates()
Example #13
def main():
    md = ihme.load_metadata()
    metric = md['metric'].copy()
    measure = md['measure'].copy()

    # datapoints
    datapoint_output_dir = osp.join(output_dir, 'deaths')
    os.makedirs(datapoint_output_dir, exist_ok=True)

    data_full = dd.from_delayed([dask.delayed(load_data)(f) for f in os.listdir(source_dir) if f.endswith('.zip')], meta=DTYPES)

    metric = metric.set_index('id')['name'].to_dict()
    measure = measure.set_index('id')['short_name'].to_dict()

    all_measures = list()
    measure_metric_combinations = product(MEASURES, METRICS)
    for g in measure_metric_combinations:
        name = measure[g[0]] + ' ' + metric[g[1]]
        print(f'creating datapoints for {name}')
        concept = to_concept_id(name)
        all_measures.append((concept, name))

        cols = ['location', 'sex', 'age', 'cause', 'year', 'val']
        df = data_full.loc[(data_full.measure == g[0]) & (data_full.metric == g[1]), cols].compute()
        serve_datapoint(df, concept)

    # entities
    serve_entities(md)

    # concepts
    cont_cdf = pd.DataFrame(all_measures, columns=['concept', 'name'])
    cont_cdf['concept_type'] = 'measure'
    cont_cdf.to_csv('../../ddf--concepts--continuous.csv', index=False)

    dis_cdf = pd.DataFrame([
        ['name', 'Name', 'string'],
        ['short_name', 'Short Name', 'string'],
        ['medium_name', 'Medium Name', 'string'],
        ['long_name', 'Long Name', 'string'],
        ['location', 'Location', 'entity_domain'],
        ['sex', 'Sex', 'entity_domain'],
        ['age', 'Age', 'entity_domain'],
        ['cause', 'Cause', 'entity_domain'],
        ['rei', 'Risk/Etiology/Impairment', 'entity_domain'],
        ['label', 'Label', 'string'],
        ['year', 'Year', 'time'],
        ['type', 'Type', 'string']
    ], columns=['concept', 'name', 'concept_type'])

    dis_cdf.sort_values(by='concept').to_csv('../../ddf--concepts--discrete.csv', index=False)

    print("Done.")
Example #14
def extract_datapoints(all_data):
    for df in all_data:
        for g, ids in df.groupby('Variable Name').groups.items():
            df_concept = df.loc[ids].copy()
            concept = to_concept_id(g)

            df_concept['area'] = df['Area'].map(to_concept_id)
            df_concept = df_concept.rename(columns={
                'Value': concept,
                'Year': 'year'
            })
            df_yield = df_concept[['area', 'year', concept]].copy()

            yield concept, df_yield.drop_duplicates()
def extract_concepts(data):
    all_concepts = list(data.keys())
    all_concept_ids = [to_concept_id(x) for x in all_concepts]

    concepts = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
    concepts['concept'] = ['name', 'year', 'sex', 'country', *all_concept_ids]
    concepts['name'] = ['Name', 'Year', 'Sex', 'Country', *all_concepts]

    concepts['concept_type'] = 'measure'
    # use .loc, not chained iloc indexing, so the assignments actually stick
    concepts.loc[0, 'concept_type'] = 'string'
    concepts.loc[1, 'concept_type'] = 'time'
    concepts.loc[2, 'concept_type'] = 'entity_domain'
    concepts.loc[3, 'concept_type'] = 'entity_domain'

    return concepts
Example #17
def extract_datapoints(data):
    dps = data[['cname', 'Time Period', 'Area ID', 'Data Value']].copy()
    dps.columns = ['concept', 'year', 'area', 'data']
    dps['area'] = dps['area'].map(to_concept_id)
    dps['concept'] = dps['concept'].map(lambda x: to_concept_id(x, '[/ -\\.\\*";:]+'))
    dps_gps = dps.groupby(by='concept')

    for k, idx in dps_gps.groups.items():
        df = dps.loc[idx][['year', 'area', 'data']].copy()
        df.columns = ['year', 'area', k]

        # assert(np.all(df[['year', 'area']].duplicated()) == False)

        df = df.sort_values(by=['area', 'year'])

        yield k, df
Example #18
def extract_concepts(data):
    discs = ['Name', 'Year', 'Country', 'Sex']

    conc = data.columns[12:28]

    cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])

    cdf['name'] = [*discs, *conc]
    cdf['concept'] = cdf['name'].map(lambda x: to_concept_id(_rename_concept(x)))

    cdf.loc[4:, 'concept_type'] = 'measure'
    cdf.loc[0, 'concept_type'] = 'string'
    cdf.loc[1, 'concept_type'] = 'time'
    cdf.loc[2:3, 'concept_type'] = 'entity_domain'

    return cdf
def extract_concepts(data):
    discs = ['Name', 'Year', 'Country']

    conc = data.columns[8:12]

    cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])

    cdf['name'] = [*discs, *conc]
    cdf['concept'] = cdf['name'].map(lambda x: to_concept_id(_rename_concept(x)))

    # only three discrete concepts here, so measures start at row 3 and
    # Country (row 2) is the sole entity domain
    cdf.loc[3:, 'concept_type'] = 'measure'
    cdf.loc[0, 'concept_type'] = 'string'
    cdf.loc[1, 'concept_type'] = 'time'
    cdf.loc[2, 'concept_type'] = 'entity_domain'

    return cdf
Example #20
def extract_datapoints(data):

    conc = data.columns[8:16]

    dps = data[['Country (code)', 'Year', *conc]].copy()
    dps.columns = ['country', 'year',
                   *[to_concept_id(_rename_concept(x)) for x in conc]]

    dps['country'] = dps['country'].map(to_concept_id)

    dps = dps.set_index(['country', 'year'])

    for k, df in dps.items():
        df_ = df.reset_index().dropna()

        yield k, df_
def extract_datapoints(data):

    conc = data.columns[8:12]

    dps = data[['Country (code)', 'Year', *conc]].copy()
    dps.columns = ['country', 'year',
                   *[to_concept_id(_rename_concept(x)) for x in conc]]

    dps['country'] = dps['country'].map(to_concept_id)

    dps = dps.set_index(['country', 'year'])

    for k, df in dps.items():
        df_ = df.reset_index().dropna()

        yield k, df_
Example #22
def extract_concepts(data):
    conc = data[['cname', 'Unit']].copy()
    conc = conc.drop_duplicates()
    conc.columns = ['name', 'unit']
    conc['concept_type'] = 'measure'
    conc['concept'] = conc['name'].map(lambda x: to_concept_id(x, '[/ -\\.\\*";:]+'))

    # manually create discrete concepts
    disc = pd.DataFrame([['Name', np.nan, 'string', 'name'],
                        ['Year', 'year', 'time', 'year'],
                        ['Area', np.nan, 'entity_domain', 'area'],
                        ['Unit', np.nan, 'string', 'unit']],
                        columns=conc.columns)

    concept = pd.concat([disc, conc])

    return concept[['concept', 'name', 'concept_type', 'unit']]
Example #23
def extract_datapoints_country_year(data):
    """extract datapoints for each concept by country and year"""

    # first, construct a dict mapping each metric to the list of source
    # columns that belong to it; we will later use it to select all
    # datapoints for a metric.

    metrics = []
    for i in data.columns[3:]:
        s = i[:-5]
        if s not in metrics:
            metrics.append(s)

    col = {}
    for m in metrics:
        col[m] = list(filter(lambda x: x.startswith(m), data.columns))

    # now we loop through each metrics and create data frame.
    res = {}
    for m in metrics:
        col_metric = np.r_[data.columns[:3], col[m]]
        # change the columns from metric.year to year
        col_metric_new = list(map(lambda x: x[-4:], col[m]))
        col_metric_new = np.r_[data.columns[:3], col_metric_new]

        data_metric = data[col_metric].copy()
        data_metric.columns = col_metric_new

        gs = data_metric.groupby(by='Uncertainty bounds*').groups

        for p in ['Lower', 'Median', 'Upper']:
            name = to_concept_id(m+'.'+p)
            headers = ['country', 'year', name]
            data_bound = data_metric.loc[gs[p]]
            data_bound = data_bound.set_index('ISO Code')
            data_bound = data_bound.T['1950':]   # the source data starts from 1950
            data_bound = data_bound.unstack().reset_index().dropna()

            data_bound.columns = headers
            data_bound['country'] = data_bound['country'].map(to_concept_id)

            res[name] = data_bound

    return res
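
# Tiny illustration (made-up column name) of the bookkeeping above: a source
# column like 'TFR.1950' belongs to metric 'TFR' (drop the 5-char '.1950'
# suffix) and its year label is the last 4 characters.
_col = 'TFR.1950'
assert _col[:-5] == 'TFR'
assert _col[-4:] == '1950'
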
def extract_datapoints(data):

    conc = data.columns[14:19]

    dps = data[['Country (code)', 'Sex (code)', 'Age group (code)', 'Year', *conc]].copy()
    dps.columns = ['country', 'sex', 'age_group', 'year',
                   *[to_concept_id(_rename_concept(x)) for x in conc]]

    dps['country'] = dps['country'].map(to_concept_id)
    dps['sex'] = dps['sex'].map(to_concept_id)

    dps['age_group'] = dps['age_group'].str.replace('+', '_plus', regex=False).map(to_concept_id)

    dps = dps.set_index(['country', 'sex', 'age_group', 'year'])

    for k, df in dps.items():
        df_ = df.reset_index().dropna()

        yield k, df_
Example #25
def process_source_files():
    """create datapoints from source files and return all concepts"""
    indi_list = []
    indi_desc_list = []

    indi = load_indicator_list()

    print('creating datapoint files...')
    for i in indi['GHO']['Metadata']['Dimension']['Code']:
        path = os.path.join(source_dir, i['@Label']+'.csv')
        concept = to_concept_id(i['@Label'])
        if not os.path.exists(path):
            # without this guard, df below could be undefined (or stale
            # from the previous iteration)
            print(f'{path} not found')
            continue
        try:
            df = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            print(f'{path} has no data')
            continue
        result, reason = can_proceed(df)
        if not result:
            print(f'{concept} skipped: {reason}')
            continue
        create_datapoint(df, concept)
        indi_list.append(concept)
        indi_desc_list.append(i['Display'])

    print('creating concept file...')
    conc = pd.DataFrame([], columns=['concept', 'concept_type', 'name'])
    conc['concept'] = indi_list
    conc['name'] = indi_desc_list
    conc['concept_type'] = 'measure'

    conc = pd.concat([conc,
                      pd.DataFrame([['name', 'string', 'Name'],
                                    ['year', 'time', 'Year'],
                                    ['country', 'entity_domain', 'Country']],
                                   columns=conc.columns)],
                     ignore_index=True)

    conc_path = os.path.join(out_dir, 'ddf--concepts.csv')
    conc.sort_values(by='concept').to_csv(conc_path, index=False)
Example #26
def extract_concepts(data):
    discs = ['Name', 'Year', 'Country', 'Sex', 'Age Group']

    conc = [
        "Labour force ('000)", "Population ('000)",
        "Labour force participation rate (%)"
    ]

    cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])

    cdf['name'] = [*discs, *conc]
    cdf['concept'] = cdf['name'].map(
        lambda x: to_concept_id(_rename_concept(x)))

    cdf.loc[5:, 'concept_type'] = 'measure'
    cdf.loc[0, 'concept_type'] = 'string'
    cdf.loc[1, 'concept_type'] = 'time'
    cdf.loc[2:4, 'concept_type'] = 'entity_domain'

    return cdf
def create_datapoints(data, indicator_mapping, method_mapping):
    data_df = data[[
        'Country or area', 'Indicator',
        'Median estimate and uncertainty intervals', 'DataValue', 'Year'
    ]]
    data_df = data_df[data_df['Median estimate and uncertainty intervals'] ==
                      'MEDIAN ESTIMATE (adjusted)']
    data_df = data_df[['Country or area', 'Year', 'Indicator',
                       'DataValue']].copy()
    data_df['concept'] = data_df['Indicator'].map(
        lambda x: indicator_mapping[x]['indicator'])
    data_df['method'] = data_df['Indicator'].map(
        lambda x: indicator_mapping[x]['method'])

    data_df.columns = ['country', 'year', 'i', 'val', 'concept', 'method']

    data_df = data_df[['country', 'year', 'concept', 'method', 'val']]

    gs = data_df.groupby('concept')

    for c, df_ in gs:
        c_id = to_concept_id(c)
        df = df_.copy()
        df = df.drop('concept', axis=1)
        df['country'] = df['country'].map(to_concept_id)
        df['method'] = df['method'].map(method_mapping)
        df.columns = ['country', 'year', 'method', c_id]
        df = df[['country', 'method', 'year', c_id]]

        if df['method'].dropna().empty:
            df = df.drop('method', axis=1)
            df.to_csv(
                '../../ddf--datapoints--{}--by--country--year.csv'.format(
                    c_id),
                index=False)
        else:
            df.to_csv(
                '../../ddf--datapoints--{}--by--country--method--year.csv'.
                format(c_id),
                index=False)
    area = data001['area'].copy()
    area_id = area.map(to_concept_id)

    area_df = pd.DataFrame([], columns=['area', 'name'])
    area_df['area'] = area_id
    area_df['name'] = area

    path = os.path.join(out_dir, 'ddf--entities--area.csv')
    area_df.to_csv(path, index=False)

    # datapoints
    dp = data001.set_index('area')
    dp = dp.T.unstack()
    dp = dp.reset_index()

    dp.columns = ['area', 'year', to_concept_id('GDP per capita')]
    dp['area'] = dp['area'].map(to_concept_id)

    path = os.path.join(out_dir, 'ddf--datapoints--gdp_per_capita--by--area--year.csv')
    dp.dropna().to_csv(path, index=False)

    # concepts
    conc = ['gdp_per_capita', 'area', 'year', 'name']
    cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
    cdf['concept'] = conc
    cdf['name'] = ['GDP per capita', 'Area', 'Year', 'Name']
    cdf['concept_type'] = ['measure', 'entity_domain', 'time', 'string']

    path = os.path.join(out_dir, 'ddf--concepts.csv')
    cdf.to_csv(path, index=False)
Example #29
    # entities
    area = data001['Area'].unique()
    area_id = list(map(to_concept_id, area))
    ent = pd.DataFrame([], columns=['area', 'name'])
    ent['area'] = area_id
    ent['name'] = area

    path = os.path.join(out_dir, 'ddf--entities--area.csv')
    ent.to_csv(path, index=False)

    # datapoints
    dps = {
        'Total Fertility Rate (TFR), also called Children per Woman': 'total_fertility_rate',
        'TFR interpolated': 'total_fertility_rate_interpolated',
        'Crude Birth Rate (CBR)': 'crude_birth_rate',
        'Princeton If index': to_concept_id('Princeton If index')
    }

    for col, col_id in dps.items():
        dp = extract_datapoints(data001, col, col_id)
        path = os.path.join(out_dir, 'ddf--datapoints--{}--by--area--year.csv'.format(col_id))

        dp.dropna().sort_values(by=['area', 'year']).to_csv(path, index=False)

    # data001_dp_1 = data001[['Area', 'Year', 'Total Fertility Rate (TFR), also called Children per Woman']].copy()
    # data001_dp_2 = data001[['Area', 'Year', 'TFR interpolated']].copy()

    # data001_dp_1.columns = ['area', 'year', 'total_fertility_rate']
    # data001_dp_2.columns = ['area', 'year', 'total_fertility_rate_interpolated']

    # data001_dp_1['area'] = data001_dp_1['area'].map(to_concept_id)
Example #30
def test_to_concept_id(s):
    from ddf_utils.str import to_concept_id

    res = to_concept_id(s)
    if res:
        # re.match with a '*' pattern matches anything; fullmatch with '+'
        # actually checks that the result only contains [0-9a-z_]
        assert re.fullmatch(r'[0-9a-z_]+', res)
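
# The test above reads like a property-based test; a sketch (assumption, not
# from the original suite) of driving it with hypothesis so `s` is generated:
import re
from hypothesis import given, strategies as st

@given(st.text())
def test_to_concept_id_property(s):
    from ddf_utils.str import to_concept_id
    res = to_concept_id(s)
    if res:
        assert re.fullmatch(r'[0-9a-z_]+', res)
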
def main():
    country_df = pd.read_csv('../../ddf--entities--geo--country.csv', dtype=str)
    synonyms_dict = pd.read_csv('../../ddf--synonyms--geo.csv', dtype=str).set_index('synonym').geo.to_dict()
    
    r = requests.get(UN_SDG_URL)
    regions_tree = r.json()
    regions_flat = flatten(regions_tree[0]) # 0 = world

    # Least Developed Countries
    ldc_entities = [ {
        'un_sdg_ldc': 'un_least_developed',
        'name': regions_tree[1]['geoAreaName'],
        'is--un_sdg_ldc': 'TRUE'
    },{
        'un_sdg_ldc': 'un_not_least_developed',
        'name': 'Other UN Countries',
        'is--un_sdg_ldc': 'TRUE'
    }]
    ldc_countries = []
    ldc_set = set()
    for country in regions_tree[1]['children']:
        code = str(country['geoAreaCode'])
        if code in synonyms_dict:
            ldc_countries.append({
                'country': synonyms_dict[code],
                'un_sdg_ldc': 'un_least_developed'
            })
            ldc_set.add(code)
        else:
            print('Could not find synonym for ', country)
    for country in regions_flat[1]['children']:
        code = str(country['geoAreaCode'])
        if code in synonyms_dict and code not in ldc_set:
            ldc_countries.append({
                'country': synonyms_dict[code],
                'un_sdg_ldc': 'un_not_least_developed'
            })

    # SDG Regions
    regions = []
    region_countries = []
    for region_id, subregions in region_composition.items():
        region_name = regions_flat[region_id]['geoAreaName']
        region_entity_id = 'un_' + to_concept_id(region_name)
        region = {
            'un_sdg_region': region_entity_id,
            'name': region_name,
            'color': '#' + region_color[region_id],
            'is--un_sdg_region': 'TRUE'
        }
        regions.append(region)

        subregions.append(region_id)
        for subregion in subregions:
            for country in regions_flat[subregion]['children']:
                code = str(country['geoAreaCode'])
                if code in synonyms_dict:
                    region_countries.append({
                        'country': synonyms_dict[code],
                        'un_sdg_region': region_entity_id
                    })
                else:
                    print('Could not find synonym for ', country)
    
    

    # un sdg region entity set
    regions_df = pd.DataFrame.from_records(regions)
    regions_df.to_csv('../../ddf--entities--geo--un_sdg_region.csv', index=False)
    
    # un sdg ldc entity set
    ldc_df = pd.DataFrame.from_records(ldc_entities)
    ldc_df.to_csv('../../ddf--entities--geo--un_sdg_ldc.csv', index=False)

    # update country properties
    country_df = country_df.set_index('country')
    region_countries = pd.DataFrame.from_records(region_countries).set_index('country')
    ldc_countries = pd.DataFrame.from_records(ldc_countries).set_index('country')

    country_df['un_sdg_region'] = region_countries.reindex(country_df.index)['un_sdg_region']
    country_df['un_sdg_ldc'] = ldc_countries.reindex(country_df.index)['un_sdg_ldc']
    country_df.to_csv('../../ddf--entities--geo--country.csv')
def concept_id(s):  # avoid shadowing the builtin `str`
    return to_concept_id(underscore(s))
Example #33
def process_file(zf, f, domains, flag_cat, geos):
    """process a file in zf, create datapoints files and return all concepts"""
    concs = []
    # file_contents = zf.read(f)
    tmpfile = mktemp()
    with open(tmpfile, 'wb') as tf:
        with zf.open(f) as z:
            # print(tmpfile)
            tf.write(z.read())
            tf.flush()
    # load the actual csv from the zipped file.
    zf2 = zipfile.ZipFile(tmpfile)
    fn_data_csv = guess_data_filename(zf2)
    data_csv = BytesIO(zf2.read(fn_data_csv))
    df = pd.read_csv(data_csv, encoding='latin1', dtype=DEFAULT_DTYPES)

    def starts_with_char(x):
        return re.match('[a-zA-Z].*', x)

    def is_good_length(x):
        return len(x) < 80

    def add_domain(x):
        return ' '.join([domains[f], x])

    if 'Element' in df.columns:
        groups = df.groupby(['Item Code', 'Element Code'])
    else:
        groups = df.groupby('Item Code')

    for g, df_g in groups:
        if 'Area Code' in df.columns:
            country_col = 'Area Code'
        elif 'Country Code' in df.columns:
            country_col = 'Country Code'
        elif 'CountryCode' in df.columns:
            country_col = 'CountryCode'
        else:
            print("Error: column layout not supportted")
            raise KeyError(df.columns)

        df_ = df_g[[country_col, 'Year', 'Value', 'Unit', 'Flag']].copy()
        item_name = df_g.iloc[0]['Item']

        if isinstance(g, tuple):  # groupby item code and element code
            element_name = df_g.iloc[0]['Element']
            item_code = str(g[0])
            if starts_with_char(item_code) and is_good_length(item_code):
                concept_fullname = ' - '.join([item_code, element_name])
            elif is_good_length(item_name):
                concept_fullname = ' - '.join([item_name, element_name])
            else:
                concept_fullname = ' - '.join([item_code, element_name])
            indicator = ' - '.join([item_name, element_name])
        else:  # only group by item code
            item_code = str(g)
            if starts_with_char(item_code) and is_good_length(item_code):
                concept_fullname = item_code
            elif is_good_length(item_name):
                concept_fullname = item_name
            else:
                concept_fullname = item_code
            indicator = item_name

        concept_id = to_concept_id(add_domain(concept_fullname))

        df_.columns = ['geo', 'year', concept_id, 'unit', 'flag']

        df_ = df_.dropna(subset=[concept_id])

        # don't include geos not in geo domain
        df_ = df_[df_['geo'].isin(geos)]

        if df_.empty:  # no content
            continue
        if len(df_['unit'].unique()) > 1:
            print('unit not unique:', concept_id, df_['unit'].unique())
            continue  # don't proceed these indicators

        unit = df_['unit'].unique()[0]
        concs.append({'name': indicator, 'concept': concept_id, 'unit': unit})

        df_['flag'] = df_['flag'].fillna('_')
        df_['flag'] = df_['flag'].astype(flag_cat)
        df_ = df_.sort_values(by='flag').drop_duplicates(
            subset=['geo', 'year'], keep='first')

        if df_[df_.duplicated(subset=['geo', 'year'])].shape[0] > 0:
            print('duplicated found in {}'.format(concept_id))

        df_ = df_[['geo', 'year', concept_id]]
        try:
            df_[concept_id] = df_[concept_id].map(format_float_digits)
        except decimal.InvalidOperation:
            print(f"{concept_id} values seems not decimals")
        df_['geo'] = df_['geo'].astype(str)
        os.makedirs(osp.join(out_dir, 'datapoints', domains[f]), exist_ok=True)
        (df_.sort_values(by=['geo', 'year']).to_csv(osp.join(
            out_dir, 'datapoints', domains[f],
            'ddf--datapoints--{}--by--geo--year.csv'.format(concept_id)),
                                                    index=False))

    # finally close and remove the temp file
    zf2.close()
    os.remove(tmpfile)

    return concs
Example #34
def main():
    entities = {}

    for f in os.listdir('../source'):
        if not f.endswith('.csv'):
            continue

        name = f[:-4]
        concept = name.lower()
        print(concept)

        df = read_source(name)
        if df is None:
            print("\tno data")
            continue
        key_columns = get_key_columns(df)
        df = check_source(df, key_columns)

        # rename column names to their ddf concept id
        column_map = {'TimePeriod': 'year', 'GeoAreaCode': 'geo_area'}
        for c in key_columns[2:]:
            column_map[c] = to_concept_id(c)
        column_map['Value'] = concept

        df_ = df[[*key_columns, 'Value']].copy()
        df_.columns = [column_map[c] for c in df_.columns]

        # entities
        if len(key_columns) > 2:
            for c in df_.columns[2:-1]:
                new_ent = create_entity(c, df_[c].unique())
                if c in entities:
                    entities[c] = pd.concat([entities[c], new_ent])
                else:
                    entities[c] = new_ent
                # convert key columns into concept IDs
                df_[c] = df_[c].map(to_concept_id)
        serve_datapoints(df_, concept)

    # entities
    serve_entities(entities)

    # geo entity, from the api
    gdf = create_geo_entity()
    gdf = sort_df(gdf, 'geo_area')
    gdf.to_csv('../../ddf--entities--geo_area.csv', index=False)

    # concepts
    cdf = create_measure_concepts()
    cdf = sort_df(cdf, 'concept')
    cdf.to_csv('../../ddf--concepts--continuous.csv', index=False)

    cdf2 = pd.DataFrame({'concept': list(entities.keys())})
    cdf2['concept_type'] = 'entity_domain'
    cdf2['name'] = cdf2['concept'].map(lambda x: x.replace('_', ' ').title())

    cdf3 = pd.DataFrame({
        'concept': [
            'geo_area', 'year', 'name', 'description', 'goal', 'indicator',
            'target'
        ],
        'concept_type': [
            'entity_domain', 'time', 'string', 'string', 'string', 'string',
            'string'
        ],
        'name': [
            'Geo Area', 'Year', 'Name', 'Description', 'Goal', 'Indicator',
            'Target'
        ]
    })
    cdf_ = pd.concat([cdf2, cdf3], ignore_index=True)
    cdf_ = sort_df(cdf_, 'concept')
    cdf_.to_csv('../../ddf--concepts--discrete.csv', index=False)
def update_enjson(enj, ddf_concept, graphs):
    """update the existing en.json with new concepts.

    enj: source en.json
    ddf_concept: ddf--concepts of this repo.
    graphs: graph settings file to get the menu levels indicator.
    """
    trs = ddf_concept[['concept', 'name', 'unit', 'description']]
    # remove the country groups and dont-panic-poverty concepts, which
    # will be processed differently.
    trs = trs.iloc[6:-3].copy()
    # create indicator name, unit, description of each concepts.
    trs['key_1'] = 'indicator/'+trs['concept']
    trs['key_2'] = 'unit/'+trs['concept']
    trs['key_3'] = 'description/'+trs['concept']

    trs1 = trs.drop('concept', axis=1).set_index('key_1')
    trs2 = trs.drop('concept', axis=1).set_index('key_2')
    trs3 = trs.drop('concept', axis=1).set_index('key_3')

    name = trs1['name'].fillna("")
    unit = trs2['unit'].fillna("")
    desc = trs3['description'].fillna("")

    # check each item in the old en.json against the name/unit/description
    # series: if an item is missing from en.json, insert it; if it is
    # already there, overwrite it only when the existing value is empty.
    for k, v in name.to_dict().items():
        if k not in enj.keys():
            enj.update({k: v})
        else:
            if len(enj[k]) == 0:
                enj.update({k: v})

    for k, v in unit.to_dict().items():
        if k not in enj.keys():
            enj.update({k: v})
        else:
            if len(enj[k]) == 0:
                enj.update({k: v})

    for k, v in desc.to_dict().items():
        if k not in enj.keys():
            enj.update({k: v})
        else:
            if len(enj[k]) == 0:
                enj.update({k: v})

    # menu levels.
    levels = graphs[['concept', 'menu_level1', 'menu_level_2']]

    l1 = levels['menu_level1'].unique()
    l2 = levels['menu_level_2'].unique()

    for i in l1:
        if i is not np.nan:
            key = to_concept_id(i)
            enj['indicator/'+key] = i

    for i in l2:
        if i is not np.nan:
            key = to_concept_id(i)
            enj['indicator/'+key] = i

    # country groupings
    c5 = ddf_concept[ddf_concept['concept_type'] == 'entity_set'].copy()
    for i, v in c5.iterrows():
        enj['indicator/'+'geo.'+v['concept']] = v['name']

    return enj
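
# The three loops in update_enjson above apply one merge rule; a compact
# equivalent helper (a sketch, not part of the original module):
def merge_missing(enj, series):
    """Insert items absent from enj; overwrite only empty existing values."""
    for k, v in series.to_dict().items():
        if k not in enj or len(enj[k]) == 0:
            enj[k] = v
# usage: merge_missing(enj, name); merge_missing(enj, unit); merge_missing(enj, desc)
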
Example #36
def main():
    data = json.load(open(source))

    # create datapoints
    dimensions = data['structure']['dimensions']
    series = data['dataSets'][0]['series']

    index_cols = [x['name'] for x in dimensions['series']]

    recs = []
    name = data['structure']['name']

    for i, v in series.items():
        idxs = list(map(int, i.split(':')))
        idx_dict = dict(zip(index_cols, idxs))
        for i, k in enumerate(idx_dict.keys()):
            assert i == dimensions['series'][i]['keyPosition']
            idx_dict[k] = dimensions['series'][i]['values'][idx_dict[k]]['id']
        for t, o in v['observations'].items():
            # TODO: the observation index might be a list?
            year = int(dimensions['observation'][0]['values'][int(t)]['id'])

            # copy idx_dict directly; re-looping it with `k, v` shadowed the
            # outer loop variables
            rec = dict(idx_dict)
            rec['year'] = year
            rec[name] = o[0]
            recs.append(rec)

    rdf = pd.DataFrame.from_records(recs)
    rdf.columns = rdf.columns.map(to_concept_id)
    rdf.amount_type = rdf.amount_type.map(to_concept_id)  # amount_type is in uppercase, fix it
    rdf = rdf.rename(columns={'aid_oda_by_sector_and_donor_dac5': 'oda'}) # rename the indicator

    (rdf.set_index(['aid_type', 'amount_type', 'donor', 'sector', 'year'])
        .to_csv(Path(out_dir,
                     'ddf--datapoints--oda--by--aid_type--amount_type--donor--sector--year.csv')))

    # create entities
    entities_concepts = []
    entities_dfs = {}

    for d in dimensions['series']:
        name = d['name']
        concept = to_concept_id(name)

        value_df = pd.DataFrame.from_records(d['values'])
        value_df.columns = [concept, 'name']

        entities_concepts.append({'concept': concept, 'name': name, 'concept_type': 'entity_domain'})
        entities_dfs[concept] = value_df

    for k, v in entities_dfs.items():
        path = Path(out_dir, f'ddf--entities--{k}.csv')
        v[k] = v[k].map(to_concept_id)
        v.to_csv(path, index=False)

    # create concepts

    # manually insert some concepts
    concepts = [
        {'concept': 'oda', 'concept_type': 'measure', 'name': 'Aid (ODA) by sector and donor'},
        {'concept': 'year', 'concept_type': 'time', 'name': 'Year'},
        {'concept': 'name', 'concept_type': 'string', 'name': 'Name'}
    ]

    cdf = pd.DataFrame.from_records([*concepts, *entities_concepts])
    cdf.to_csv(Path(out_dir, 'ddf--concepts.csv'), index=False)

    print('Done.')
Example #37
def csvs_to_ddf(files, out_path):
    """convert raw files to ddfcsv

    Args
    ----
    files: list
        a list of file paths to build ddf csv
    out_path: `str`
        the directory to put the ddf dataset

    """
    import re
    from os.path import join
    from ddf_utils.str import to_concept_id

    concepts_df = pd.DataFrame([['name', 'Name', 'string']],
                               columns=['concept', 'name', 'concept_type'])
    concepts_df = concepts_df.set_index('concept')

    all_entities = dict()

    pattern = r'indicators--by--([ 0-9a-zA-Z_-]*).csv'

    for f in files:
        data = pd.read_csv(f)
        basename = os.path.basename(f)
        keys = re.match(pattern, basename).groups()[0].split('--')
        keys_alphanum = list(map(to_concept_id, keys))

        # check if there is a time column. Assume last column is time.
        try:
            pd.to_datetime(data[keys[-1]], format='%Y')
        except (ValueError, pd.errors.OutOfBoundsDatetime):
            has_time = False
        else:
            has_time = True

        if has_time:
            ent_keys = keys[:-1]
        else:
            ent_keys = keys

        # set concept type
        for col in data.columns:
            concept = to_concept_id(col)

            if col in keys:
                if col in ent_keys:
                    t = 'entity_domain'
                else:
                    t = 'time'
            else:
                t = 'measure'

            concepts_df.loc[concept] = [col, t]

        for ent in ent_keys:
            ent_df = data[[ent]].drop_duplicates().copy()
            ent_concept = to_concept_id(ent)
            ent_df.columns = ['name']
            ent_df[ent_concept] = ent_df.name.map(to_concept_id)

            if ent_concept not in all_entities.keys():
                all_entities[ent_concept] = ent_df
            else:
                all_entities[ent_concept] = pd.concat([all_entities[ent_concept], ent_df],
                                                      ignore_index=True)

        data = data.set_index(keys)
        for c in data:
            # output datapoints
            df = data[c].copy()
            df = df.reset_index()
            for k in keys[:-1]:
                df[k] = df[k].map(to_concept_id)
            df.columns = df.columns.map(to_concept_id)
            (df.dropna()
               .to_csv(join(out_path,
                            'ddf--datapoints--{}--by--{}.csv'.format(
                                to_concept_id(c), '--'.join(keys_alphanum))),
                       index=False))

    # output concepts
    concepts_df.to_csv(join(out_path, 'ddf--concepts.csv'))

    # output entities
    for c, df in all_entities.items():
        df.to_csv(join(out_path, 'ddf--entities--{}.csv'.format(c)), index=False)

    dp = get_datapackage(out_path, use_existing=False)
    dump_json(os.path.join(out_path, 'datapackage.json'), dp)

    return
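
# Hedged usage sketch for csvs_to_ddf; the input file name is hypothetical
# but follows the `indicators--by--...` pattern the function's regex expects:
#
#     csvs_to_ddf(['./raw/indicators--by--country--year.csv'], './ddf_out')
#     # -> writes ddf--concepts.csv, ddf--entities--*.csv,
#     #    ddf--datapoints--*.csv and datapackage.json under ./ddf_out
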
Example #38
def conceptID(s):  # avoid shadowing the builtin `str`
    return to_concept_id(camelToSnake(s))
Example #39
    concs = data['Element'].unique()
    cdf = pd.DataFrame([], columns=['concept', 'name', 'concept_type'])
    cdf['name'] = ['Name', 'Country', 'Item', 'Year', *concs]
    cdf['concept'] = cdf['name'].map(to_concept_id)
    cdf.concept_type = 'measure'

    cdf.loc[0, 'concept_type'] = 'string'
    cdf.loc[1, 'concept_type'] = 'entity_domain'
    cdf.loc[2, 'concept_type'] = 'entity_domain'
    cdf.loc[3, 'concept_type'] = 'time'

    cdf.to_csv(os.path.join(out_path, 'ddf--concepts.csv'), index=False)

    # datapoints
    data_ = data[['Country Code', 'Item Code', 'Element', 'Year Code', 'Value']]
    gs = data_.groupby('Element').groups

    for k, idx in gs.items():
        cid = to_concept_id(k)
        df = data_.loc[idx].copy()
        df = df.drop('Element', axis=1)
        df.columns = ['country', 'item', 'year', cid]

        path = os.path.join(
            out_path, 'ddf--datapoints--{}--by--country--item--year.csv').format(cid)
        df.to_csv(path, index=False)

    get_datapackage(out_path, use_existing=True, to_disk=True)

    print('Done.')
Example #40
    # entities
    area = data001['Area'].unique()
    area_id = list(map(to_concept_id, area))
    ent = pd.DataFrame([], columns=['area', 'name'])
    ent['area'] = area_id
    ent['name'] = area

    path = os.path.join(out_dir, 'ddf--entities--area.csv')
    ent.to_csv(path, index=False)

    # datapoints
    dps_list = [
        'Life expectancy at birth', 'Life expectancy, with interpolations'
    ]
    dps = dict([(x, to_concept_id(x)) for x in dps_list])

    for col, col_id in dps.items():
        dp = extract_datapoints(data001, col, col_id)
        path = os.path.join(
            out_dir, 'ddf--datapoints--{}--by--area--year.csv'.format(col_id))

        dp.dropna().sort_values(by=['area', 'year']).to_csv(path, index=False)

    # data001_dp_1 = data001[['Area', 'Year', 'Total Fertility Rate (TFR), also called Children per Woman']].copy()
    # data001_dp_2 = data001[['Area', 'Year', 'TFR interpolated']].copy()

    # data001_dp_1.columns = ['area', 'year', 'total_fertility_rate']
    # data001_dp_2.columns = ['area', 'year', 'total_fertility_rate_interpolated']

    # data001_dp_1['area'] = data001_dp_1['area'].map(to_concept_id)
Example #41
    # read codebook csv
    cb = pd.read_csv(cb_csv, skiprows=1)
    cb = cb.drop('Variable:', axis=1)  # unneeded column
    # read data csv
    data = pd.read_csv(data_csv)

    # now begins the etl process
    names = list()
    measures = list()

    data['metric'] = data['metric'].map(to_concept_id)

    # each metric will act as a measure, and there are 3 parts of data
    # for each measure
    for met in cb.metric.drop(0).dropna().unique():
        met_id = to_concept_id(met)
        df = data.groupby(by='metric').get_group(met_id)
        for i in ['mean', 'lower', 'upper']:
            # append the measure and measure name lists.
            measure = '{}_{}'.format(met_id, i)
            measures.append(measure)
            if i in ['lower', 'upper']:
                name = '{}, 95% Uncertainty Interval - {} Bound'.format(
                    met, i.title())
            else:
                name = '{}: Mean'.format(met)
            names.append(name)

            # save datapoints
            df = df.rename(columns={i: measure})
            df_out = df[[
# -*- coding: utf-8 -*-

import pandas as pd
import os
from ddf_utils.str import to_concept_id
from ddf_utils.index import create_index_file


source = '../source/gapdata009.xlsx'
out_dir = '../../'

# the datapoint name and datapoint id
dp_name_sheet = 'data & sources'
dp_name = 'Average age at 1st marriage (girls)'
dp_id = to_concept_id(dp_name)

if __name__ == '__main__':

    # reading data
    data001 = pd.read_excel(source, sheet_name=dp_name_sheet)

    data001_dp = data001[['Country', 'Year', 'Data']].copy()
    data001_dp.columns = ['country', 'year', dp_id]

    # entities
    country = data001_dp['country'].unique()
    country_id = list(map(to_concept_id, country))
    ent = pd.DataFrame([], columns=['country', 'name'])
    ent['country'] = country_id
    ent['name'] = country
    path = os.path.join(out_dir, 'ddf--entities--country.csv')
def generate_metadata(ddf_concept, graphs, meta2, area, outdir, oneset=False):
    """Generate the metadata.json.

    ddf_concept: ddf--concepts for this repo.
    graphs: graph settings
    meta2: the old metadata.json
    area: area_categorizarion.json
    outdir: the output dir of datapoints.
    oneset: if True, only one entity set (world_4region) will be added.
    """
    # use OrderedDict in order to keep the order of insertion.
    indb = OrderedDict([['indicatorsDB', OrderedDict()]])

    # rename indicator_url to sourceLink
    ddf_concept = ddf_concept.rename(columns={'indicator_url': 'sourceLink'})

    # convert json fields to dict/list object.
    to_json = lambda x: json.loads(x) if isinstance(x, str) else x
    ddf_concept['scales'] = ddf_concept['scales'].map(to_json)
    ddf_concept['color'] = ddf_concept['color'].map(to_json)

    # geo property
    geo_list = ['geo', 'name', 'latitude', 'longitude',
                'world_4region']
    geo_cols = ['scales', 'sourceLink', 'color']

    ddf_concept = ddf_concept.set_index('concept')
    for k in geo_list:
        values = ddf_concept.loc[[k], geo_cols]
        values.columns = ['scales', 'sourceLink', 'color']
        value_dict = to_dict_dropna(values)
        if k == 'geo':
            key = k
        else:
            key = 'geo.'+k
        indb['indicatorsDB'][key] = value_dict[k]
        indb['indicatorsDB'][key]['use'] = 'property'

    # manually add a _default and time indicator
    indb['indicatorsDB']['time'] = {
        "use": "indicator",
        "scales": ["time"],
        "sourceLink": ""
    }
    indb['indicatorsDB']['_default'] = {
        "use": "constant",
        "scales": ["ordinal"],
        "sourceLink": ""
    }

    if not oneset:
        group_data = ddf_concept[ddf_concept['domain'] == 'geo'][geo_cols]
        group_names = group_data.index
        group_names = group_names.drop(['country', 'world_4region'])

        for g in sorted(group_names):
            value_dict = to_dict_dropna(group_data.loc[[g]])
            key = 'geo.'+g
            indb['indicatorsDB'][key] = value_dict[g]
            indb['indicatorsDB'][key]['use'] = 'property'

    ddf_concept = ddf_concept.reset_index()

    # all measure types.
    measure_cols = ['concept', 'sourceLink', 'scales', 'interpolation', 'color']
    mdata = ddf_concept[ddf_concept['concept_type'] == 'measure'][measure_cols]
    mdata = mdata.set_index('concept')
    mdata = mdata.drop(['longitude', 'latitude'])
    mdata.columns = ['sourceLink', 'scales', 'interpolation', 'color']
    mdata['use'] = 'indicator'

    mdata_dict = to_dict_dropna(mdata)
    for k in sorted(mdata_dict.keys()):
        indb['indicatorsDB'][k] = mdata_dict.get(k)

    for i in indb['indicatorsDB'].keys():
        fname = os.path.join(outdir, 'ddf--datapoints--'+i+'--by--geo--time.csv')
        try:
            df = pd.read_csv(fname, dtype={i: float, 'time': int})
        except (OSError, IOError):
            print('no datapoints for ', i)
            continue

        # domain and availability
        dm = [float(df[i].min()), float(df[i].max())]
        av = [int(df['time'].min()), int(df['time'].max())]

        # make it zero when the number is too small
        if np.abs(dm[0]) < 1e-5:
            dm[0] = 0
        if np.abs(av[0]) < 1e-5:
            av[0] = 0

        # domain_quantiles_10_90:
        # 1) sort by indicator value
        # 2) remove the top and bottom 10% of values (so with 100 points, remove 10 from each end)
        # 3) take the first and last value of what's left as min and max in the property above.
        values_sorted = df[i].sort_values().values
        q_10 = int(np.round(len(values_sorted) / 10))
        q_90 = -1 * q_10 - 1

        # values_sorted = values_sorted[q_10:q_90]
        # domain_quantiles_10_90 = [values_sorted.min(), values_sorted.max()]

        domain_quantiles_10_90 = [values_sorted[q_10], values_sorted[q_90]]
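        # e.g. with 100 sorted values: q_10 = round(100 / 10) = 10 and
        # q_90 = -11, so the kept span is values_sorted[10] .. values_sorted[-11]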

        indb['indicatorsDB'][i].update({
            'domain': dm, 'availability': av,
            'domain_quantiles_10_90': domain_quantiles_10_90
        })

    # newdb = OrderedDict([[key, indb['indicatorsDB'][key]] for key in sorted(indb['indicatorsDB'].keys())])
    # indb['indicatorsDB'] = newdb

    # indicator Trees
    indb['indicatorsTree'] = OrderedDict([['id', '_root'], ['children', []]])
    ti = OrderedDict([['id', 'time']])
    pro = OrderedDict([['id', '_properties'], ['children', [{'id': 'geo'}, {'id': 'geo.name'}]]])

    indb['indicatorsTree']['children'].append(ti)
    all_levels = graphs[['concept', 'menu_level1', 'menu_level_2']].sort_values(['menu_level1', 'menu_level_2'], na_position='first')

    # change nans to something more convenient
    all_levels['menu_level1'] = all_levels['menu_level1'].apply(to_concept_id).fillna('0')
    all_levels['menu_level_2'] = all_levels['menu_level_2'].apply(to_concept_id).fillna('1')

    all_levels = all_levels.set_index('concept')

    g = all_levels.groupby('menu_level1').groups

    ks = list(sorted(g.keys()))

    # move 'for_advanced_user' to the end of list.
    if 'for_advanced_users' in ks:
        ks.remove('for_advanced_users')
        ks.append('for_advanced_users')

    # loop through all level1 keys, for each key:
    # if key is nan, insert to the _root tree with {'id': concept_name}
    # else, insert {'id': concept_name, 'children': []}
    # then group all concepts with the key as level 1 menu by level 2 menu
    # loop though each level 2 group and do the same insertion logic as above.
    for key in ks:
        if key == '0':  # so it's NaN
            for i in sorted(g[key]):
                indb['indicatorsTree']['children'].append({'id': i})
            # insert the _properties entry after the root level menus as requested.
            indb['indicatorsTree']['children'].append(pro)
            if not oneset:
                for i in range(len(area)):
                    key = 'geo.'+to_concept_id(area[i]['n'])
                    # remove geo.geographic_regions_in_4_colors as requested
                    # by Jasper
                    if key == 'geo.geographic_regions_in_4_colors':
                        continue
                    indb['indicatorsTree']['children'][-1]['children'].append({'id': key})
            indb['indicatorsTree']['children'][-1]['children'].append({'id': 'geo.world_4region'})
            continue

        od = OrderedDict([['id', key], ['children', []]])
        indb['indicatorsTree']['children'].append(od)

        g2 = all_levels.loc[g[key]].groupby('menu_level_2').groups
        for key2 in sorted(g2.keys()):
            if key2 == '1':  # it's NaN
                for i in sorted(g2[key2]):
                    indb['indicatorsTree']['children'][-1]['children'].append({'id': i})
            else:
                od = OrderedDict([['id', key2], ['children', []]])
                indb['indicatorsTree']['children'][-1]['children'].append(od)
                for i in sorted(g2[key2]):
                    indb['indicatorsTree']['children'][-1]['children'][-1]['children'].append({'id': i})

    return indb
Example #44
def cleanup_data(source_dir):
    all_data = []

    for f in os.listdir(source_dir):
        if 'xls' in f:
            if '2014' in f:
                data = pd.read_excel(os.path.join(source_dir, f), skiprows=1, sheet_name='CPI 2014')
                data = data[['Country / Territory', 'Unnamed: 2', 'CPI 2014 Score']]  ## unnamed 2 is wbcode
                data['year'] = 2014
                data = data.dropna()
                data.columns = ['country', 'wbcode', 'cpi', 'year']
                all_data.append(data)
            if '2015' in f:
                data = pd.read_excel(os.path.join(source_dir, f), sheet_name='CPI 2015')
                data = data[['Country', 'wbcode', 'CPI2015']]
                data['year'] = 2015
                data = data.dropna()
                data.columns = ['country', 'wbcode', 'cpi', 'year']
                all_data.append(data)
            if '2010' in f:
                data = pd.read_excel(os.path.join(source_dir, f), skiprows=1, sheet_name='CPI table')
                data = data[['Country / Territory', 'CPI 2010 Score']]
                data['year'] = 2010
                data = data.drop([0, 1]).dropna()
                data.columns = ['country', 'cpi', 'year']
                all_data.append(data)
            if '2011' in f:
                data = pd.read_excel(os.path.join(source_dir, f), sheet_name='Global')
                data = data[['Country / Territory', 'CPI 2011 Score']]
                data['year'] = 2011
                data = data.dropna()
                data.columns = ['country', 'cpi', 'year']
                all_data.append(data)
            if '2012' in f:
                data = pd.read_excel(os.path.join(source_dir, f), sheet_name='CPI 2012')
                data = data[['Country / Territory', 'CPI 2012 Score']]
                data['year'] = 2012
                data = data.dropna()
                data.columns = ['country', 'cpi', 'year']
                all_data.append(data)
            if '2013' in f:
                data = pd.read_excel(os.path.join(source_dir, f), sheet_name='CPI 2013')
                data = data[['Country / Territory', 'Unnamed: 2', 'CPI 2013 Score']]  ## unnamed 2 is wbcode
                data['year'] = 2013
                data = data.dropna()
                data.columns = ['country', 'wbcode', 'cpi', 'year']
                all_data.append(data)

    # concat all data and fill in wbcode column.
    all_data_df = pd.concat(all_data)
    all_data_df = all_data_df.reset_index(drop=True)
    all_data_df.country = all_data_df.country.str.strip()
    gps = all_data_df.groupby('country')
    for k, v in gps.groups.items():
        df = all_data_df[all_data_df['country'] == k]

        wbcode_list = df['wbcode'].unique()
        wbcode_list = [x for x in wbcode_list if x is not np.nan]

        if len(wbcode_list) == 0:
            country = to_concept_id(k)
        else:
            country = wbcode_list[0]

        all_data_df.loc[v, 'wbcode'] = country

    return all_data_df