Example no. 1
def processMultiCountryFile(config):
    us.log(config)
    xlFile = getXLFile(config)
    multiIndex = excelToMultiIndex(xlFile, header=[1,2])
    cols = getTopCols(multiIndex)
    gen1Path = gen_1_dir + config["path"]
    us.mkdirs(gen1Path)
    prefix = us.format_lower_no_spaces(config["descriptor"])
    countryMap = {}
    for country in cols:
        lcountry = us.format_lower_no_spaces(country)
        us.log(country)
        if config["timeseries"]:
            try:
                (monthly, yearly, meta) = getTimeseriesDataSet(multiIndex, country, config)
                if len(monthly.index) > 0 and len(yearly.index) == 0:  # monthly data present but no annual series
                    yearly = build_annual_from_monthly(monthly, country)
                    
                countryMap[lcountry] = (monthly, yearly, meta)
                if len(yearly.index):
                    writeGen1XLS(yearly, meta, gen1Path, lcountry, prefix, "annual")
                if len(monthly.index):
                    writeGen1XLS(monthly, meta, gen1Path, lcountry, prefix, "monthly")
            except ValueError:
                print("********************** Unable to convert data for " + country)
                continue
        else:
            (data, meta) = getDataSet(multiIndex, country, config)
            countryMap[lcountry] = (data, meta)
            writeGen1XLS(data, meta, gen1Path, lcountry, prefix)
    return countryMap
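
A minimal usage sketch (the config values here are hypothetical; "path", "descriptor", and "timeseries" are the keys the function actually reads, while gen_1_dir and the us helper module are assumed to exist at module level):

sample_config = {
    "path": "energy/",             # subdirectory created under gen_1_dir
    "descriptor": "Fuel Imports",  # lower-cased and de-spaced to build the file prefix
    "timeseries": True,            # True -> monthly/annual split; False -> a single data set per country
}
country_map = processMultiCountryFile(sample_config)
# country_map keys are normalized country names; values are (monthly, yearly, meta) tuples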
Example no. 2
def get_data_by_indicator(id_indicator):
    dimensions = get_dimensions(
        id_indicator, us.load_carib_country_dict(key_column="cepalid"))
    url = build_url(
        "getDataMeta", "idIndicator=" + id_indicator + "&" +
        build_dimension_options(dimensions))
    us.log(url)
    us.log(dimensions)
    response = urllib.request.urlopen(url)
    xml = response.read()
    tree = et.fromstring(xml)
    datos = tree.findall("datos")[0].findall("dato")
    data = []
    for dat in datos:
        row = []
        for dim in dimensions.keys():
            key = dat.attrib["dim_" + dim]
            row.append(dimensions[dim]["labels"][key])
            #print(dimensions[key]["name"])
        row.append(dat.attrib["valor"])
        row.append(dat.attrib["id_fuente"])
        row.append(dat.attrib["ids_notas"])
        row.append(dat.attrib["iso3"])
        data.append(row)
    cols = [dimensions[k]["name"] for k in dimensions] + ["Value", "Source", "Notes", "ISO3"]
    cols[cols.index("Countries")] = "Country"
    cols[cols.index("Years")] = "Year"
    df = pd.DataFrame(data, columns=cols)
    newCols = ["Year", "Value", "Source", "Notes", "ISO3"]
    for key in cols:
        if key not in newCols:  # linear scan per column; fine for the handful of dimensions involved
            newCols.append(key)
    df = df.reindex(columns=newCols)  # reindex_axis has been removed from pandas; reindex(columns=...) is the replacement
    return (df, get_metadata_from_tree(tree))
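
A short usage sketch (the indicator id "2204" is purely illustrative; any valid CEPALStat indicator id would work, and us, urllib.request, xml.etree.ElementTree as et, and pandas as pd are assumed to be imported at module level):

df, metadata = get_data_by_indicator("2204")
us.log(df.head())   # one row per <dato> element, reordered so Year/Value/Source/Notes/ISO3 come first
us.log(metadata)    # whatever get_metadata_from_tree pulled out of the same XML tree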
Example no. 3
def get_data_by_indicator(id_indicator):	
    dimensions = get_dimensions(id_indicator, us.load_carib_country_dict(key_column="cepalid"))
    url = build_url("getDataMeta","idIndicator=" + id_indicator + "&" + build_dimension_options(dimensions))
    us.log(url)
    us.log(dimensions)
    response = urllib.request.urlopen(url)
    xml = response.read()
    tree = et.fromstring(xml)
    datos = tree.findall("datos")[0].findall("dato")
    data = []
    for dat in datos:
        row = []
        for dim in dimensions.keys():
            key = dat.attrib["dim_" + dim]
            row.append(dimensions[dim]["labels"][key])
            #print(dimensions[key]["name"])
        row.append(dat.attrib["valor"])
        row.append(dat.attrib["id_fuente"])
        row.append(dat.attrib["ids_notas"])
        row.append(dat.attrib["iso3"])
        data.append(row)
    cols = [dimensions[k]["name"] for k in dimensions] + ["Value","Source","Notes","ISO3"]
    cols[cols.index("Countries")] = "Country"
    cols[cols.index("Years")] = "Year"
    df = pd.DataFrame(data, columns=cols)
    newCols = ["Year","Value","Source","Notes","ISO3"]
    for key in cols:
        if key not in newCols: # linear scan per column; fine for the handful of dimensions involved
            newCols.append(key)
    df = df.reindex(columns=newCols) # reindex_axis has been removed from pandas; reindex(columns=...) is the replacement
    return (df, get_metadata_from_tree(tree))
Example no. 4
def build(config):
    statfile = config["gen_1_dir"] + config["indicator_id"] + "_all.csv"
    df = pd.read_csv(statfile, encoding="utf-8", index_col=["ISO3"], dtype={'Notes':'object'} )
    metafile = config["gen_1_dir"] + config["indicator_id"] + "_meta.csv"
    metamap = parse_meta_file(metafile)
    metamap.update(config) # this enables customizations in config to override entries in the meta file
    report = build_files(df, metamap)
    us.log("%i series saved to %s" % (len(report), config["gen_2_dir"]))
    return report
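
A hedged example of the config this expects (keys taken from the function body; values are placeholders; the remaining keys that build_files needs, such as prefix, suffix, source, and multiplier, are expected to come from the parsed meta file or from config overrides):

config = {
    "gen_1_dir": "data/gen1/",
    "gen_2_dir": "data/gen2/",
    "indicator_id": "2204",   # used to locate <indicator_id>_all.csv and <indicator_id>_meta.csv
}
report = build(config)        # any key present in config overrides the same key read from the meta file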
Example no. 5
def build_files(df, config):
    filelist = []
    countrylist = []
    for iso3 in us.get_index_set(df):
        try:
            idf = df.loc[iso3]  # .loc replaces the .ix indexer removed from pandas
            # idf is a Series when there is only one row for this country, but we always want a DataFrame
            if isinstance(idf, pd.Series):
                idf = pd.DataFrame([idf])
            idf = idf[["Year", "Value", "Source", "Notes"]]
            idf.columns = ["year", "value", "source", "note"]
            mult = config["multiplier"]
            if mult:
                if (-1 <= mult <= 1) or not isinstance(mult, int):
                    idf["value"] = idf["value"].apply(lambda x: x * mult)
                else:
                    idf["value"] = idf["value"].apply(lambda x: int(x * mult)).astype(object)
            idf["source"] = idf["source"].apply(lambda x: config["source"])
            idf["note"] = idf["note"].apply(lambda x: get_notes(str(x), config))
            filestem = config["prefix"] + "_" + iso3.lower() + "_" + config["suffix"]
            filename = filestem + ".csv"
            filepath = config["gen_2_dir"] + filename
            us.log(filepath)
            idf.to_csv(filepath, encoding="utf8", index=False)

            country = us.get_country_by_iso3(iso3)
            meta = [
                ("name", "%s - %s [CEPALStat]" % (country, config["indicator"])),
                ("originalsource", config["source"]),
                ("proximatesource", "CEPALStat"),
                ("dataset", config["indicator"] + " [" + config["indicator_id"] + "]"),
                ("description", config["definition"]),
                ("category", config["indicator_category"]),
                ("type", config["indicator_type"]),
                ("file", filename),
                ("filehash", us.githash(filepath)),
                ("columns", "year,value,source,notes"),
            ]

            metafile = config["gen_2_dir"] + filestem + "_meta.csv"
            pd.DataFrame(meta, columns=["key", "value"]).to_csv(
                metafile, encoding="utf8", float_format="%.3f", index=False
            )
            filelist.append([filestem])
            countrylist.append(country)
        except Exception:  # details are logged below via sys.exc_info()
            us.log("ERROR: Failed to build data for %s" % iso3)
            us.log(sys.exc_info())
            traceback.print_tb(sys.exc_info()[2])

    fldf = pd.DataFrame(filelist, index=countrylist).sort_index()
    fldf.to_csv(
        config["gen_2_dir"] + "_" + config["prefix"] + ".csv",
        encoding="utf8",
        float_format="%.1f",
        index=False,
        header=False,
    )
    return fldf
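
A sketch of a direct call (all values are placeholders; the keys are the ones build_files reads, and df is assumed to be indexed by ISO3 as produced by the Generation 1 step):

config = {
    "gen_2_dir": "data/gen2/",
    "prefix": "cepalstat", "suffix": "2204",
    "multiplier": None,                  # falsy, so values are written through unchanged
    "source": "CEPAL",
    "indicator": "Example indicator", "indicator_id": "2204",
    "definition": "Placeholder definition",
    "indicator_category": "Economy", "indicator_type": "Percentage",
}
file_index = build_files(df, config)     # DataFrame of file stems indexed by country name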
Example no. 6
def build_files(df, config):
    filelist = []
    countrylist = []
    for iso3 in us.get_index_set(df):
        try:
            idf = df.loc[iso3] # .loc replaces the .ix indexer removed from pandas
            if isinstance(idf, pd.Series): #idf is a Series when there is only one row for this country, but we always want a DataFrame
                idf = pd.DataFrame([idf])
            idf = idf[["Year","Value","Source","Notes"]]
            idf.columns = ["year","value","source","notes"]
            mult = config["multiplier"]
            if mult:
                if (-1 <= mult <= 1) or not isinstance(mult, int):
                    idf["value"] = idf["value"].apply(lambda x : x * mult)
                else:
                    idf["value"] = idf["value"].apply(lambda x : int(x * mult)).astype(object)
            idf["source"] = idf["source"].apply(lambda x : config["source"])
            idf["notes"] = idf["notes"].apply(lambda x : get_notes(str(x), config))
            filestem = config["prefix"] + "_" + iso3.lower() + "_" + config["suffix"]
            filename = filestem + ".csv"
            filepath = config["gen_2_dir"] + filename
            us.log(filepath)
            idf.to_csv(filepath, encoding="utf8", index=False)
                   
            country = us.get_country_by_iso3(iso3)    
            meta = [("name", "%s - %s [CEPALStat]" % (country, config["indicator"])),
                ("originalsource", config["source"]),
                ("proximatesource", "CEPALStat"),
                ("dataset", config["indicator"] + " [" + config["indicator_id"] + "]"),
                ("description", config["definition"]),
                ("category", config["indicator_category"]),
                ("type", config["indicator_type"]),
                ("file", filename),
                ("filehash", us.githash(filepath)),
                ("columns", "year,value,source,notes")
                ]
     
            metafile = config["gen_2_dir"] + filestem + "_meta.csv"    
            pd.DataFrame(meta,columns = ["key","value"]).to_csv(metafile, encoding="utf8", float_format='%.3f',index=False)
            filelist.append([filestem])
            countrylist.append(country)
        except Exception: # details are logged below via sys.exc_info()
            us.log("ERROR: Failed to build data for %s" % iso3)
            us.log(sys.exc_info())
            traceback.print_tb(sys.exc_info()[2])
            
    fldf = pd.DataFrame(filelist, index=countrylist).sort_index()
    fldf.to_csv(config["gen_2_dir"] + "_" + config["prefix"] + ".csv", encoding="utf8", float_format='%.1f', index=False, header=False)
    return fldf
Example no. 7
dataset[0][2]  # This is what the metadata looks like

# <markdowncell>

# Generation 2 - Refines the rough csv data from Generation 1 into a standardized csv format common to all data sets. Prepares this data for importing to the database.

# <markdowncell>

# This code follows a fairly standard form, in which it loops through each country's information, writes the data to a .csv file, extracts some metadata and writes that to a .csv file, and then writes a list of all the files that it made to a file called _PREFIX.csv

# <codecell>

filelist = []
us.mkdirs(config["gen_2_dir"])
for (iso3, df, mf) in dataset:
    us.log(iso3)
    try:
        df.columns = ["description", "value"]
    except Exception:  # The data for St Kitts only has one column. We are excluding it for now.
        us.log(sys.exc_info())
        continue
    filestem = config["prefix"] + "_" + iso3.lower() + "_" + config["suffix"]
    filename = filestem + ".csv"
    filepath = config["gen_2_dir"] + filename
    df.to_csv(filepath, encoding="utf8", float_format='%.3f', index=False)

    country = us.get_country_by_iso3(iso3)
    meta = [("name", "%s - %s [SIDS RCM]" % (country, config["indicator"])),
            ("originalsource", mf.ix["Capacity"]["Source"]),
            ("proximatesource", "SIDS RCM"), ("dataset", config["indicator"]),
            ("description", config["description"]),
# <markdowncell>

# Generation 2 - Refines the rough csv data from Generation 1 into a standardized csv format common to all data sets. Prepares this data for importing to the database.

# <markdowncell>

# This code follows a fairly standard form, in which it loops through each country's information, writes the data to a .csv file, extracts some metadata and writes that to a .csv file, and then writes a list of all the files that it made to a file called _PREFIX.csv

# <codecell>

keycol = "Total"

filelist = []
us.mkdirs(config["gen_2_dir"])
for (iso3, df, mf) in dataset:
    us.log(iso3)
    try:
        if len(df.columns) == 2:
            df.columns = ["year", "value"]
        else:
            df.columns = ["year", "value", "notes"]
        
    except Exception: # The data for St Kitts only has one column. We are excluding it for now.
        us.log(sys.exc_info())
        continue
    filestem = config["prefix"] + "_" + iso3.lower() + "_" + config["suffix"]
    filename = filestem + ".csv"
    filepath = config["gen_2_dir"] + filename
    df.to_csv(filepath, encoding="utf8", float_format='%.3f', index=False)
    
    country = us.get_country_by_iso3(iso3)
# Generation 1 - Downloads XML through the CEPALStat web service and dumps the enclosed data to a csv file.

# <codecell>

gen1_cepalstat.download(config)

# <markdowncell>

# Generation 2 - Refines the rough csv data from Generation 1 into a standardized csv format common to all data sets. Prepares this data for importing to the database.

# <codecell>

gen2_cepalstat.build(config)
data_list = gen3_utils.get_gen2_data(config)
us.log(data_list[0:3]) # have a look at a sample of the data

# <markdowncell>

# Generation 3 - Load the Generation 2 data into the database.

# <codecell>

gen3_utils.standard_load_from_data_list(data_list)

# <markdowncell>

# Generation 4 - Build a PHP file that uses JavaScript to generate SVGs using the data in the database

# <codecell>
Example no. 10
def standard_load_from_data_list(data_list, observation_fields="series, locationid, dateid, value, source, note"):
    with us.get_db_connection() as cursor:
        for (series_key, iso3, meta, data) in data_list:
            cursor.execute("SELECT ID FROM location WHERE iso3 = %s AND divisionname IS NULL AND city IS NULL", [iso3])
            locationid = list(cursor)[0][0]
            
            cursor.execute("DELETE FROM series WHERE identifier = %s", series_key)
            cursor.execute("DELETE FROM observation WHERE series = %s", series_key)
            
            series_fields = "identifier, dataset, file, filehash, category, type, name, " + \
                            "description, originalsource, proximatesource, originalsourcelink, proximatesourcelink"
            insert_series = "INSERT INTO series (" + series_fields +") VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" 
            values = [series_key]
            for field in series_fields.split(",")[1:]:
                field = field.strip()
                if field in meta:
                    if pd.isnull(meta[field]):
                        values.append(None)
                    else:
                        values.append(meta[field])
                else:
                    values.append(None)
            cursor.execute(insert_series, values)

            for indx in data.index:
                record = data.loc[indx]  # .loc replaces the .ix indexer removed from pandas
                values = []
                valplaces = []
                for field in observation_fields.split(","):
                    valplaces.append("%s")
                    field = field.strip()
                    if field == "dateid": #TODO: this needs to be revisited; it assumes that "year" is the only date value
                        cursor.execute("SELECT ID FROM date WHERE year = %s AND month IS NULL AND day IS NULL", [str(record["year"])])
                        dateid = list(cursor)[0][0]        
                        values.append(dateid)      
                    
                    if field == "description":
                        values.append(get_translated_field(data, record, "description"))
                
                    if field == "locationid": #TODO: this needs to be revisited
                        values.append(locationid)   
                        
                    if field == "note":        
                        values.append(get_translated_field(data, record, "note"))
                
                    if field == "series":
                        values.append(series_key)
                        
                    if field == "source":               
                        values.append(get_translated_field(data, record, "source"))     
                
                    if field == "value":
                        values.append(str(record["value"]).strip())
                                
                insert_observation = "INSERT INTO observation(" + observation_fields + ") VALUES (" + ", ".join(valplaces) + ")"
                # this logging can be very chatty
                if verbose:
                    us.log(insert_observation)
                    us.log(values)
                cursor.execute(insert_observation, values)
            cursor.execute("COMMIT")
Example no. 11
def standard_load_from_data_list(
        data_list,
        observation_fields="series, locationid, dateid, value, source, note"):
    with us.get_db_connection() as cursor:
        for (series_key, iso3, meta, data) in data_list:
            cursor.execute(
                "SELECT ID FROM location WHERE iso3 = %s AND divisionname IS NULL AND city IS NULL",
                [iso3])
            locationid = list(cursor)[0][0]

            cursor.execute("DELETE FROM series WHERE identifier = %s",
                           series_key)
            cursor.execute("DELETE FROM observation WHERE series = %s",
                           series_key)

            series_fields = "identifier, dataset, file, filehash, category, type, name, " + \
                            "description, originalsource, proximatesource, originalsourcelink, proximatesourcelink"
            insert_series = "INSERT INTO series (" + series_fields + ") VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            values = [series_key]
            for field in series_fields.split(",")[1:]:
                field = field.strip()
                if field in meta:
                    if pd.isnull(meta[field]):
                        values.append(None)
                    else:
                        values.append(meta[field])
                else:
                    values.append(None)
            cursor.execute(insert_series, values)

            for indx in data.index:
                record = data.loc[indx]  # .loc replaces the .ix indexer removed from pandas
                values = []
                valplaces = []
                for field in observation_fields.split(","):
                    valplaces.append("%s")
                    field = field.strip()
                    if field == "dateid":  #TODO: this needs to be revisited; it assumes that "year" is the only date value
                        cursor.execute(
                            "SELECT ID FROM date WHERE year = %s AND month IS NULL AND day IS NULL",
                            [str(record["year"])])
                        dateid = list(cursor)[0][0]
                        values.append(dateid)

                    if field == "description":
                        values.append(
                            get_translated_field(data, record, "description"))

                    if field == "locationid":  #TODO: this needs to be revisited
                        values.append(locationid)

                    if field == "note":
                        values.append(
                            get_translated_field(data, record, "note"))

                    if field == "series":
                        values.append(series_key)

                    if field == "source":
                        values.append(
                            get_translated_field(data, record, "source"))

                    if field == "value":
                        values.append(str(record["value"]).strip())

                insert_observation = "INSERT INTO observation(" + observation_fields + ") VALUES (" + ", ".join(
                    valplaces) + ")"
                # this logging can be very chatty
                if verbose:
                    us.log(insert_observation)
                    us.log(values)
                cursor.execute(insert_observation, values)
            cursor.execute("COMMIT")