def parse_io():
    """Parse Canadian make/use/final-demand IO CSVs at each aggregation level.

    Relies on module-level state: `io_tables` maps an aggregation level to
    its (intermediate, finaldemand) source-table codes, and `io_col_map` is
    a shared column-name map that is mutated in place before each parse —
    the call order below matters.
    """
    # we'll just parse the same file twice,
    # once each for make/use
    for (agglevel, (intermediate, finaldemand)) in io_tables.items():
        colnames = ["year", "industry", "commodity", "value"]
        if agglevel == "detail":
            # detail level stores short sector codes resolved via lookups
            coltypes = ["int", "varchar(15)", "varchar(15)", "float"]
            colfuncs = {
                "industry": get_industry_code,
                "commodity": get_commodity_code,
                }
        else:
            # aggregate levels keep full sector names; values need the
            # "millions" suffix stripped
            coltypes = ["int", "varchar(255)", "varchar(255)", "float"]
            colfuncs = {
                "industry": strip_millions,
                "commodity": strip_millions}

        # parse intermediate
        filename = "%s-eng.csv" % intermediate
        filepath = fileutils.getcache(filename, "ca")
        io_col_map["industry"] = "IND"
        # the source column name for commodities differs by aggregation level
        if agglevel == "detail":
            io_col_map["commodity"] = "COMMOD"
        else:
            io_col_map["commodity"] = "COMM"
        csvtable = CSVTable(filepath, True, "cp1252")
        tablename = "%s.io_make_%s" % (config.SCHEMA, agglevel)
        csvtable.create_sql_table(tablename, colnames, coltypes)
        csvtable.parse_to_sql(io_col_map, colfuncs, skip_make, cascade=True)
        # we can reuse CSVTable for the same source file
        tablename = "%s.io_use_%s" % (config.SCHEMA, agglevel)
        csvtable.create_sql_table(tablename, colnames, coltypes)
        csvtable.parse_to_sql(io_col_map, colfuncs, skip_use)

        # parse final demand
        filename = "%s-eng.csv" % finaldemand
        filepath = fileutils.getcache(filename, "ca")
        io_col_map["commodity"] = "COMM"
        io_col_map["industry"] = "CAT"
        fdtable = CSVTable(filepath, True, "cp1252")
        tablename = "%s.io_fd_%s" % (config.SCHEMA, agglevel)
        fdtable.create_sql_table(tablename, colnames, coltypes)
        if agglevel == "detail":
            # final-demand categories use their own code lookup
            colfuncs["industry"] = get_fd_industry_code
        fdtable.parse_to_sql(io_col_map, colfuncs, skip_finaldemand)
def parse_tables():
    """Create and populate a SQL table for every cached annual IO file.

    Make and use files are recognized by name (via is_make/is_use);
    codes.csv is special-cased; anything else is ignored.
    """
    for fname in fileutils.getcachecontents("io-annual"):
        cached_path = fileutils.getcache(fname, "io-annual")
        print(cached_path)
        csv_source = CSVTable(cached_path, False)
        year_of_make = is_make(fname)
        year_of_use = is_use(fname)
        if year_of_make:
            csv_source.create_sql_table(
                "%s.annual_make_%s" % (config.IO_SCHEMA, year_of_make),
                ["industry", "commodity", "value"],
                ["varchar(6)", "varchar(6)", "float"])
        elif year_of_use:
            csv_source.create_sql_table(
                "%s.annual_use_%s" % (config.IO_SCHEMA, year_of_use),
                ["commodity", "industry", "value"],
                ["varchar(6)", "varchar(6)", "float"])
        elif fname == "codes.csv":
            csv_source.create_sql_table(
                "%s.annual_codes" % config.IO_SCHEMA,
                ["code", "description"],
                ["varchar(6)", "text"])
        else:
            # not a file we know how to load
            continue
        csv_source.parse_to_sql()
def parse_io():
    """Parse Japanese IO workbooks (one per benchmark year) into SQL.

    Sheet layout (from the code below): a header row carries industry
    codes, the next row carries industry names, and subsequent rows are
    one source sector each with values from column 2 on.
    """
    # choose 中分類 (medium classification) for all io tables.
    # 中分類 for 1990 and 1995 don't break down the electronic
    # sectors as far as i would like, so use 小分類 (small classification)
    files = {
        1990: "l00_21.xls",
        1995: "l00_21.xls",
        2000: "io00a301.xls",
        2005: "io05a301.xls",
        }
    tables = HybridTableCreator(config.SCHEMA)
    for (year, filename) in files.items():
        # 1995 and 2000 io tables: easiest
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)
        # for 1995 use the heisei 2-7-12 file since it has more
        # harmonized sectors than the standalone 1995 file
        if year == 1995:
            sheetindex = 2
        else:
            # the first page of the heisei 2-7-12 file (used for 1990)
            # happens to be 1990 at nominal prices, matching the others
            sheetindex = 0
        path = fileutils.getcache(filename, "jp", str(year))
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(sheetindex)
        ind_names = None
        ind_codes = None
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if ind_codes is None:
                # still looking for the code header row; it is recognized
                # either by a numeric cell equal to 1 or by a "001" string.
                # NOTE(review): cell.strip() assumes string cells here —
                # a non-matching float cell would raise; presumably the
                # data never hits that case. TODO confirm.
                for cell in row:
                    if cell == 1:
                        # numeric codes: normalize to zero-padded 3-char strings
                        ind_codes = [str(c).strip().rjust(3, "0") for c in row]
                        break
                    if cell.strip() == "001":
                        ind_codes = row
                        break
            elif ind_names is None:
                # the row after the codes carries the industry names;
                # register each (code, name) pair and keep the canonical codes.
                ind_names = row
                temp_codes = [None, None]
                # inner loop reuses `i`; harmless since the outer loop
                # iterates a range object
                for i in range(2, len(row)):
                    temp_codes.append(codes.set_code(ind_codes[i], row[i]))
                ind_codes = temp_codes
            else:
                # data row: column 0 is the source sector code, column 1
                # its name, columns 2+ are values per destination sector
                from_code = row[0]
                if type(from_code) is float:
                    from_code = str(int(from_code)).rjust(3, "0")
                from_code = codes.set_code(from_code, row[1])
                if from_code:
                    for i in range(2, len(row)):
                        to_code = ind_codes[i]
                        value = row[i]
                        tables.insert_io(year, from_code, to_code, value)
        # persist the sector codes collected for this year
        codes.update_codes()
def parse_env():
    """Parse UK greenhouse-gas emissions by sector into env tables.

    One sheet per emissions series; the first row whose third cell is
    numeric is taken as the year header, and every later row is one
    sector's values across those years.
    """
    workbook = xlrd.open_workbook(
        fileutils.getcache("rftghgemissions.xls", "uk"))
    tables = HybridTableCreator(config.SCHEMA)
    codes = tables.new_sector_codes(prefix="env_ind")
    # sector descriptions that need hand-assigned codes
    codes.add_curated_codes({
        "Manufacture of petrochemicals": "20.1[467]+20.6",
        "Manufacture of other basic metals & casting (excl. Nuclear fuel & Aluminium)": "24.4[^26]-5",
        "Rest of repair; Installation": "33.1[^56]",
        })
    for sheet in workbook.sheets():
        series = sheet.name
        years = None  # header row of year labels, once found
        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)
            # skip short rows and rows whose first data cell is blank text
            if len(row) < 3 or type(row[2]) is str and not len(row[2]):
                continue
            if years is None:
                # first row with a numeric third cell is the year header
                if type(row[2]) is float:
                    years = row
                    for year in row[2:]:
                        tables.add_env_table(year)
            else:
                code = codes.set_code(row[0], row[1])
                if code:
                    for col in range(2, len(row)):
                        tables.insert_env(years[col], code, series, row[col])
    codes.update_codes()
def parse_measurement(filename, measurement, tracker):
    """Parse one cached energy CSV and record the national ("US") rows.

    Updates the module-level `data` dict (year -> MSN code -> measurement)
    and inserts (year, source, sector, value) rows into the module-level
    price or use table via `tracker`, keyed by `measurement`.
    """
    filepath = fileutils.getcache(filename)
    with open(filepath) as infile:
        reader = csv.reader(infile)
        header = next(reader)
        # ensure every year column has a bucket in the global dict
        for yearstr in header[2:]:
            data.setdefault(int(yearstr), {})
        for record in reader:
            if len(record) != len(header):
                continue
            if record[0] != "US":
                continue
            msn = record[1][:4]
            for col in range(2, len(record)):
                cell = record[col].strip()
                if not len(cell):
                    continue
                year = int(header[col])
                data[year].setdefault(msn, {})[measurement] = cell
                # first two chars of the MSN are the source, next two the sector
                row_values = [year, msn[0:2], msn[2:4], float(cell)]
                if measurement == "price":
                    tracker.insert_row(pricetable, row_values)
                elif measurement == "use_btu":
                    tracker.insert_row(usetable, row_values)
def parse_io():
    """Parse UK industry-by-industry supply/use workbooks into IO tables."""
    tables = HybridTableCreator(config.SCHEMA)
    codes = tables.new_sector_codes(prefix="ind")
    codes.add_curated_codes(config.curated_sectors)
    codes.blacklist_code("Differences between totals and sums of components are due to rounding")
    # each workbook covers a contiguous span of years
    sources = (
        ("bb09-su-tables-1992-2003.xls", range(1992, 2004)),
        ("input-output-supply-and-use-tables--2004-2008.xls",
         range(2004, 2009)),
    )
    for (workbook_name, years) in sources:
        workbook = xlrd.open_workbook(fileutils.getcache(workbook_name, "uk"))
        for year in years:
            parse_ixi_year(tables, codes, workbook, year)
    codes.update_codes()
def parse_env():
    """Parse WIOD environmental-accounts workbooks into per-year SQL tables.

    Creates one env_<year> table per study year with columns
    (country, industry, measurement, value), then fills it from the
    per-country, per-series Excel files. Uses the module-level
    `industry_tracker` to register industry codes.
    """
    tables = {}
    for year in config.STUDY_YEARS:
        tablename = "%s.env_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "measurement", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(31)", "float"]
        tables[year] = SQLTable(tablename, colnames, coltypes).create()
        tables[year].truncate()

    # NOTE(review): this list is built but never used below
    countries = sorted(config.countries.keys())
    countries.append("ROW")  # rest of world

    for (series, attribs) in config.env_series.items():
        # a series may live in a differently-named cache subdirectory
        if "dir" in attribs:
            subdir = attribs["dir"]
        else:
            subdir = series
        subdir = os.path.join("wiod", subdir)
        # some series files carry only a code column, no name column
        skip_name = "skip_name" in attribs and attribs["skip_name"]
        for country in config.countries.keys():
            filename = "%s_%s_May12.xls" % (country, series)
            print(filename)
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)
            for year in config.STUDY_YEARS:
                sheet = wb.sheet_by_name("%d" % year)
                measurements = sheet.row_values(0)
                # disambiguate measurement labels that repeat across series
                if series == "EU":
                    measurements = [m + " - Gross" for m in measurements]
                elif series == "CO2":
                    measurements = ["CO2 - " + m for m in measurements]
                for i in range(1, sheet.nrows):
                    row = sheet.row_values(i)
                    if len(row[0].strip()):
                        if skip_name:
                            # code-only layout: values start at column 1
                            ind_code = row[0]
                            first_col = 1
                        else:
                            # name + code layout: values start at column 2
                            ind_name = row[0]
                            ind_code = row[1]
                            industry_tracker.set_code(ind_code, ind_name)
                            first_col = 2
                        for j in range(first_col, len(row)):
                            value = row[j]
                            if type(value) is float and value != 0:
                                measurement = measurements[j]
                                tables[year].insert(
                                    [country, ind_code, measurement, value])
def doparse():
    """Parse EIA world electric-power workbooks into the eia.world_power table.

    Iterates over every (source, measurement) spreadsheet; the header row
    (the first row with a numeric cell in column 2) supplies the years,
    and each later row holds one country's values.

    Fix: the inner value loop previously reused the outer row index
    variable `i`; both loops now use distinct names.
    """
    # map full country names back to 3-letter codes
    country_dict = dict((v, k) for k, v in config.countries.items())
    country_dict["Slovakia"] = "SVK"

    sources = ["total", "nuclear", "thermal", "renewable",
               "geothermal", "solar", "wind", "biomass"]
    measurements = ["capacity", "consumption"]

    tablename = "%s.world_power" % ("eia")
    table = SQLTable(
        tablename,
        ["year", "country", "source", "units", "value"],
        ["int", "char(3)", "varchar(15)", "varchar(4)", "float"])
    table.create()
    table.truncate()

    for source in sources:
        for measure in measurements:
            if measure == "consumption":
                # consumption data is not published for these sources
                if source in ("geothermal", "solar", "wind", "biomass"):
                    continue
                units = "bkWh"  # billion kilowatt-hours
            elif measure == "capacity":
                units = "MkW"  # million kilowatts

            filename = source + "_" + measure + ".xls"
            path = fileutils.getcache(filename, "eia")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_index(0)
            header = None
            for rownum in range(sheet.nrows):
                row = sheet.row_values(rownum)
                if header is None:
                    # header row: first row with a numeric cell in column 2;
                    # keep year ints where numeric, None placeholders elsewhere
                    if len(row) > 2 and type(row[2]) is float:
                        header = []
                        for cell in row:
                            if type(cell) is float:
                                header.append(int(cell))
                            else:
                                header.append(None)
                        header_len = len(header)
                elif len(row) > 2:
                    country_name = row[0]
                    if country_name in country_dict:
                        country = country_dict[country_name]
                        for col in range(2, header_len):
                            value = row[col]
                            year = header[col]
                            if type(value) is float and value > 0:
                                table.insert(
                                    [year, country, source, units, value])
def parse_io():
    """Build UK IO tables from the 1992-2003 and 2004-2008 SU workbooks."""
    tables = HybridTableCreator(config.SCHEMA)
    codes = tables.new_sector_codes(prefix="ind")
    codes.add_curated_codes(config.curated_sectors)
    # footnote row that must never be treated as a sector
    codes.blacklist_code(
        "Differences between totals and sums of components are due to rounding"
    )

    wb = xlrd.open_workbook(
        fileutils.getcache("bb09-su-tables-1992-2003.xls", "uk"))
    year = 1992
    while year < 2004:
        parse_ixi_year(tables, codes, wb, year)
        year += 1

    wb = xlrd.open_workbook(
        fileutils.getcache(
            "input-output-supply-and-use-tables--2004-2008.xls", "uk"))
    year = 2004
    while year < 2009:
        parse_ixi_year(tables, codes, wb, year)
        year += 1

    codes.update_codes()
def create_simple_make_use(self, year, filename, factor=1):
    """Populate make and use tables for `year` from a whitespace-delimited
    cache file whose rows are: commodity, industry, use value, make value
    (both values in producers' prices). Values are scaled by `factor`.
    """
    self.create_make_table(year)
    self.create_use_table(year, has_margins=False)
    with open(fileutils.getcache(filename), "r") as source:
        for record in source:
            fields = record.split()
            if len(fields) != 4:
                continue
            (commodity, industry, use_value, make_value) = fields
            self.insert_make(commodity, industry, make_value, factor)
            self.insert_use(commod=commodity, indus=industry,
                            useval=use_value, factor=factor)
def parse_codes():
    """Parse WIOD sector-map and exchange-rate reference data into SQL.

    Populates two tables: the manually curated io/env sector map from the
    packaged CSV, and annual per-country exchange rates from the WIOD
    workbook.

    Fix: the CSV file handle was opened without a context manager and
    never closed; it is now managed by `with`.
    """
    ## manually curated sector map
    table = SQLTable("%s.sector_map" % config.WIOD_SCHEMA,
                     ["io_code", "env_code", "description"],
                     ["varchar(15)", "varchar(15)", "text"]).create()
    table.truncate()

    sector_map = fileutils.getdatapath("sector_map.csv", "wiod")
    with open(sector_map, "r") as fh:
        csvf = csv.reader(fh)
        header = next(csvf)  # discard the column-header row
        for row in csvf:
            # blank codes are stored as NULL
            io_code = row[0].strip()
            if not len(io_code):
                io_code = None
            env_code = row[1].strip()
            if not len(env_code):
                env_code = None
            desc = row[2].strip()
            table.insert([io_code, env_code, desc])

    ## current exchange rates
    table = SQLTable("%s.exchange_rates" % config.WIOD_SCHEMA,
                     ["country", "year", "rate"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    path = fileutils.getcache("exr_wiod.xls", "wiod")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_name("EXR")
    year_list = None
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        if len(row) < 2:
            continue
        if year_list is None:
            # the header row starts with "Country"; years follow from col 2
            if type(row[0]) is str and row[0].strip() == "Country":
                year_list = [int(cell.strip("_ ")) for cell in row[2:]]
        else:
            # data rows carry a 3-letter country code in column 1
            if type(row[1]) is str and len(row[1].strip()) == 3:
                country = row[1]
                if country == "GER":
                    # WIOD uses GER for Germany; normalize to ISO DEU
                    country = "DEU"
                for (year, value) in zip(year_list, row[2:]):
                    table.insert([country, year, value])
def parse_env():
    """Parse Canadian environmental-accounts CSVs into per-table SQL tables.

    `eea_tables` (module-level) maps a StatCan table code to a spec dict
    with the SQL table name, a column map, and an optional row-skip
    callback.

    Fix: the optional "skip_callback" lookup used a membership test plus
    a second dict lookup; replaced with a single `dict.get`.
    """
    for (tablecode, tablespec) in eea_tables.items():
        filename = "%s-eng.csv" % tablecode
        filepath = fileutils.getcache(filename, "ca")
        csvtable = CSVTable(filepath, True)
        tablename = "%s.%s" % (config.SCHEMA, tablespec["tablename"])
        csvtable.create_sql_table(tablename,
                                  ["year", "industry", "value"],
                                  ["int", "varchar(255)", "float"])
        col_funcs = {"industry": get_industry_code}
        col_map = tablespec["col_map"]
        # optional per-table row filter; None means keep every row
        skip_callback = tablespec.get("skip_callback")
        csvtable.parse_to_sql(col_map, col_funcs, skip_callback)
def parse_sut(sheet_name, table_prefix):
    """Parse one WIOD supply-or-use sheet for every country into SQL.

    sheet_name   -- name of the Excel sheet to read from each country file
    table_prefix -- schema-qualified prefix; one table per study year is
                    created as <prefix>_<year>
    """
    tables = {}
    colnames = ["country", "commodity", "industry", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s_%d" % (table_prefix, year)
        tables[year] = SQLTable(tablename, colnames, coltypes).create()
        tables[year].truncate()

    for country in config.countries.keys():
        # TODO: more automated way to get this
        if country in ("AUS", "DEU", "GBR", "USA"):
            filename = "%s_SUT_Feb12.xls" % country
        else:
            filename = "%s_SUT_Jan12.xls" % country
        subdir = os.path.join("wiod", "suts")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)

        # extract supply and use tables at fob prices
        sheet = wb.sheet_by_name(sheet_name)
        # row 0 holds industry codes, row 1 the matching descriptions
        industry_row = sheet.row_values(0)
        row = sheet.row_values(1)
        industry_codes = []
        for (code, desc) in zip(industry_row, row):
            industry_codes.append(industry_tracker.set_code(code, desc))
        for i in range(2, sheet.nrows):
            row = sheet.row_values(i)
            # data rows: col 0 year, cols 1-2 commodity code/name,
            # cols 3+ one value per industry
            if not len(row[0].strip()):
                continue
            year = int(row[0])
            if year not in config.STUDY_YEARS:
                continue
            com_code = commodity_tracker.set_code(row[1], row[2])
            if not com_code:
                continue
            for j in range(3, len(row)):
                value = row[j]
                ind_code = industry_codes[j]
                if value != 0 and ind_code:
                    # commodity first
                    tables[year].insert(
                        [country, com_code, ind_code, value])
def create_simple_transaction_table(self, year, filename, factor=1):
    """Create and fill an IO transactions table for `year` from a
    whitespace-delimited cache file of (producer, consumer, value) rows.

    Values are scaled by `factor` and stored as integer thousands;
    zero-valued rows are dropped.

    Fix: corrected the "transations" typo in the progress message.
    """
    print("creating transactions table for %s..." % year)
    tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
    xtable = SQLTable(tablename,
                      ["producer", "consumer", "thousands"],
                      ["varchar(6)", "varchar(6)", "int"])
    xtable.create()
    xtable.truncate()
    insert_count = 0
    with open(fileutils.getcache(filename), "r") as f:
        for line in f:
            cols = line.split()
            if len(cols) >= 3:
                value = float(cols[2]) * factor
                if (value != 0):
                    xtable.insert([cols[0], cols[1], int(value)])
                    insert_count += 1
    print("%d rows inserted" % insert_count)
def create_simple_transaction_table(self, year, filename, factor=1):
    """Create and fill an IO transactions table for `year` from a
    whitespace-delimited cache file of (producer, consumer, value) rows.

    Values are scaled by `factor` and stored as integer thousands;
    zero-valued rows are dropped.

    Fixes: corrected the "transations" typo in the progress message and
    normalized the `print (...)` spacing.
    """
    print("creating transactions table for %s..." % year)
    tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
    xtable = SQLTable(tablename,
                      ["producer", "consumer", "thousands"],
                      ["varchar(6)", "varchar(6)", "int"])
    xtable.create()
    xtable.truncate()
    insert_count = 0
    with open(fileutils.getcache(filename), "r") as f:
        for line in f:
            cols = line.split()
            if len(cols) >= 3:
                value = float(cols[2]) * factor
                if (value != 0):
                    xtable.insert([cols[0], cols[1], int(value)])
                    insert_count += 1
    print("%d rows inserted" % insert_count)
def parse_io():
    """Parse Taiwanese IO workbooks (one per benchmark year) into io_<year>
    tables, and create a harmonized sector view for 2010.

    Fix: removed unused locals (`to_codes` from the code header row and
    `from_code` per data row); the tables store sector names, not codes.
    """
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
    }
    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)
        # millions are in NTD
        table = SQLTable(tablename,
                         ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        # row 0 holds sector codes (unused); row 1 holds the sector names
        # that the table actually stores
        to_names = sheet.row_values(1)
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_name = row[1].strip()
            for i in range(2, len(to_names)):
                to_name = to_names[i].strip()
                value = row[i]
                table.insert([from_name, to_name, value])

        if year == 2010:
            # build a view that maps raw sector names onto harmonized
            # sectors via the curated sector_map table
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist":
                    sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                    sqlhelper.set_repr(config.from_blacklists[year]),
            }
            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
    SELECT from_map.io_sector AS from_sector,
           to_map.io_sector as to_sector,
           sum(millions) as millions
      FROM %(tablename)s io,
           (SELECT DISTINCT io_sector, io_commod
              FROM %(maptable)s) from_map,
           (SELECT DISTINCT io_sector, io_ind
              FROM %(maptable)s) to_map
     WHERE io.to_sector NOT IN %(to_blacklist)s
       AND io.from_sector NOT IN %(from_blacklist)s
       AND from_map.io_commod = io.from_sector
       AND to_map.io_ind = io.to_sector
     GROUP BY from_map.io_sector, to_map.io_sector""" % strings
            print(sql)
            db.execute(sql)
def parse_env():
    """Scrape Chinese pollutant-emissions HTML tables into cn.emissions_<year>.

    Each numeric cache subdirectory is a year; each file inside is an HTML
    page whose data lives in deeply nested tables. The parser finds the
    first table with more than 10 rows and then untangles rowspan overflow
    (long cell values spilled into the rows below) with stateful heuristics.
    """
    cache_dirs = fileutils.getcachecontents("cn")
    for adir in cache_dirs:
        # only numeric directory names are years worth parsing
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue
        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            # max_sector_column == 0 means Chinese and English names share
            # one cell; otherwise they occupy separate columns 0 and 1
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
            for (pollutant, amount) in zip(
                    columns[max_sector_column+1:],
                    rowdata[max_sector_column+1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        # one transaction per year directory
        xact = db.xact(mode="READ WRITE")
        xact.begin()
        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
            print(adir, filename)
            title = soup.title.string
            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break
            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise
            prev_rowdata = None
            prev_rowspans = None
            data = []
            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None
            insert_now = None
            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue
                rowspans = []
                rowdata = []
                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None
                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
                for cellpos in range(rowlen):
                    cell = cells[cellpos]
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])
                    # strip ellipses and non-breaking spaces from cell text
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')
                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]
                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)
                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)
                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)
                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()
                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans
                if len(rowdata) == 0:
                    continue
                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                # ("单位" means "unit")
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue
                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue
                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)
                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break
                # buffer one row: a numeric row confirms the buffered row
                # was complete and can be flushed
                if has_numbers or insert_later is None:
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
                        # if we knocked blank cells off the previous row but
                        # we know it's actually longer from the current row
                        for i in range(len(insert_later), len(rowdata)):
                            insert_later.append(rowdata[i])
                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    # first flushed row becomes the column headers
                    columns = insert_now
                    insert_now = None
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
                elif insert_now is not None and \
                        len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)
            # close the loop: flush the final buffered row
            if insert_later is not None and \
                    len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)
            print(columns)
        xact.commit()
def get_filename(self):
    """Return the cache path for this object's file, namespaced by year."""
    relative = os.path.join(str(self.year), self.filename)
    return fileutils.getcache(relative)
def get_overlay_data_location(self):
    """Return the gnuplot cache path for this file's overlay data."""
    return fileutils.getcache("%s-overlay.dat" % self.filename, "gnuplot")
def parse_int():
    """Parse WIOD international supply/use workbooks into SQL tables.

    For each study year creates int_use_<year> (use broken down by country
    of origin) and int_make_<year> (supply), filling them from each
    country's USE_/SUP_ sheets. Uses the module-level industry and
    commodity trackers to register sector codes.
    """
    for year in config.STUDY_YEARS:
        tablename = "%s.int_use_%d" % (config.WIOD_SCHEMA, year)
        colnames = [
            "from_country", "to_country", "commodity", "industry", "value"
        ]
        coltypes = [
            "char(3)", "char(3)", "varchar(15)", "varchar(15)", "float"
        ]
        use_table = SQLTable(tablename, colnames, coltypes).create()

        tablename = "%s.int_make_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "commodity", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        make_table = SQLTable(tablename, colnames, coltypes).create()

        # the two-digit year is embedded in the workbook name
        filename = "IntSUT%s_row_Apr12.xls" % str(year)[2:4]
        subdir = os.path.join("wiod", "intsuts_analytic")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)
        for country in config.countries.keys():
            sheet = wb.sheet_by_name("USE_%s" % country)
            # row 0 holds industry codes, row 1 their descriptions
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))
            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                # notes say Use tables are broken down by origin
                from_country = row[1]
                # stupid hack so i don't have to change char(3)
                if from_country == "ZROW":
                    from_country = "RoW"
                com_code = commodity_tracker.set_code(row[2], row[3])
                if not com_code:
                    continue
                for j in range(4, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        use_table.insert(
                            [from_country, country,
                             com_code, ind_code, value])

            sheet = wb.sheet_by_name("SUP_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))
            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                # supply sheets have no origin column: cols 1-2 are the
                # commodity code/name, values start at col 3
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # industry first
                        make_table.insert(
                            [country, ind_code, com_code, value])
def parse_io():
    """Parse WIOD national IO tables (industry-by-industry) into SQL.

    Creates one niot_<year> table per study year and fills it from each
    country's NIOT xlsx workbook, flagging rows below the "Imports"
    marker as imported flows.
    """
    ### for ind x ind tables
    tables = {}
    colnames = ["country", "from_ind", "to_ind", "is_import", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "bool", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s.niot_%d" % (config.WIOD_SCHEMA, year)
        tables[year] = SQLTable(tablename, colnames, coltypes)  #.create()
        tables[year].drop()
        tables[year].create()
        tables[year].truncate()

    va_sectors = set(config.va_sectors.values())
    for country in config.countries.keys():
        filename = "%s_NIOT_ROW_Apr12.xlsx" % country
        subdir = os.path.join("wiod", "niot")
        path = fileutils.getcache(filename, subdir)
        # use_iterators is legacy openpyxl streaming (read-only) mode
        wb = openpyxl.load_workbook(filename=path, use_iterators=True)
        for year in config.STUDY_YEARS:
            imports = {}
            sheet = wb.get_sheet_by_name("%d" % year)
            rows = sheet.iter_rows()
            industry_row = None
            # scan for the marker row that carries the industry codes
            for row in rows:
                cell = row[0]
                if cell.internal_value == "(industry-by-industry)":
                    industry_row = row
                    break
            row = next(rows)  # industry names
            industry_codes = []
            for (code_cell, desc_cell) in zip(industry_row, row):
                code = code_cell.internal_value
                desc = desc_cell.internal_value
                industry_codes.append(industry_tracker.set_code(code, desc))
            for row in rows:
                from_code = None
                from_desc = None
                is_import = False
                for (to_code, value_cell) in zip(industry_codes, row):
                    column = value_cell.column
                    value = value_cell.internal_value
                    # excel columns use letters: A is the source code,
                    # B its description, C marks where data begins
                    if column == "A":
                        from_code = value_cell.internal_value
                    elif column == "B":
                        from_desc = value_cell.internal_value
                    elif column == "C":
                        from_code = industry_tracker.set_code(
                            from_code, from_desc)
                        if not from_code:
                            break
                        # rows labeled "Imports" switch the import flag on
                        if type(value) is str and value == "Imports":
                            is_import = True
                    elif (column > "D" or len(column) > 1) \
                            and to_code and value != 0:
                        # string comparison skips columns A-D; multi-letter
                        # columns (AA, AB, ...) compare "less" than "D" so
                        # len(column) > 1 catches them
                        tables[year].insert(
                            [country, from_code, to_code, is_import, value])


### for supply and use tables
def parse_sut(sheet_name, table_prefix):
    """Parse one WIOD supply-or-use sheet for every country into SQL.

    sheet_name   -- name of the Excel sheet to read from each country file
    table_prefix -- schema-qualified prefix; one table per study year is
                    created as <prefix>_<year>
    """
    tables = {}
    colnames = ["country", "commodity", "industry", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s_%d" % (table_prefix, year)
        tables[year] = SQLTable(tablename, colnames, coltypes).create()
        tables[year].truncate()

    for country in config.countries.keys():
        # TODO: more automated way to get this
        if country in ("AUS", "DEU", "GBR", "USA"):
            filename = "%s_SUT_Feb12.xls" % country
        else:
            filename = "%s_SUT_Jan12.xls" % country
        subdir = os.path.join("wiod", "suts")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)

        # extract supply and use tables at fob prices
        sheet = wb.sheet_by_name(sheet_name)
        # row 0 holds industry codes, row 1 the matching descriptions
        industry_row = sheet.row_values(0)
        row = sheet.row_values(1)
        industry_codes = []
        for (code, desc) in zip(industry_row, row):
            industry_codes.append(industry_tracker.set_code(code, desc))
        for i in range(2, sheet.nrows):
            row = sheet.row_values(i)
            if not len(row[0].strip()):
                continue
            year = int(row[0])
            if year not in config.STUDY_YEARS:
                continue
            com_code = commodity_tracker.set_code(row[1], row[2])
            if not com_code:
                continue
            for j in range(3, len(row)):
                value = row[j]
                ind_code = industry_codes[j]
                if value != 0 and ind_code:
                    # commodity first
                    tables[year].insert(
                        [country, com_code, ind_code, value])


# make tables
parse_sut("SUP_bas", "%s.make" % config.WIOD_SCHEMA)
# use tables
parse_sut("USE_bas", "%s.use" % config.WIOD_SCHEMA)
def parse_int():
    """Parse WIOD international supply/use workbooks into per-year tables.

    int_use_<year> holds use flows broken down by country of origin;
    int_make_<year> holds supply. Sector codes are registered through
    the module-level industry and commodity trackers.
    """
    for year in config.STUDY_YEARS:
        tablename = "%s.int_use_%d" % (config.WIOD_SCHEMA, year)
        colnames = [
            "from_country", "to_country", "commodity", "industry", "value"]
        coltypes = [
            "char(3)", "char(3)", "varchar(15)", "varchar(15)", "float"]
        use_table = SQLTable(tablename, colnames, coltypes).create()

        tablename = "%s.int_make_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "commodity", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        make_table = SQLTable(tablename, colnames, coltypes).create()

        # workbook names embed the two-digit year
        filename = "IntSUT%s_row_Apr12.xls" % str(year)[2:4]
        subdir = os.path.join("wiod", "intsuts_analytic")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)
        for country in config.countries.keys():
            sheet = wb.sheet_by_name("USE_%s" % country)
            # row 0: industry codes, row 1: descriptions
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))
            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                # notes say Use tables are broken down by origin
                from_country = row[1]
                # stupid hack so i don't have to change char(3)
                if from_country == "ZROW":
                    from_country = "RoW"
                com_code = commodity_tracker.set_code(row[2], row[3])
                if not com_code:
                    continue
                for j in range(4, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        use_table.insert(
                            [from_country, country,
                             com_code, ind_code, value])

            sheet = wb.sheet_by_name("SUP_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))
            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                # supply sheets: cols 1-2 are commodity code/name,
                # values start at col 3
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # industry first
                        make_table.insert(
                            [country, ind_code, com_code, value])
def doparse():
    """Parse population/GDP supplement data into <schema>.world_supplement.

    Combines three sources: US Census IDB population counts (pipe-delimited
    text), World Bank indicator workbooks, and the IMF WEO dump (a
    tab-delimited csv despite its .xls name).

    Fix: the regex patterns are now raw strings; "\\d" in a normal string
    literal is an invalid escape sequence on modern Python.
    """
    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename,
                     ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    # (maps FIPS country codes to ISO 3-letter codes)
    country_fips = {
        "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT", "SW": "SWE",
        "CA": "CAN", "AS": "AUS", "EI": "IRL", "GM": "DEU", "BE": "BEL",
        "TW": "TWN", "DA": "DNK", "UK": "GBR", "FR": "FRA", "JA": "JPN",
        "KS": "KOR", "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE",
        "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK", "PL": "POL",
        "EN": "EST", "HU": "HUN", "LH": "LTU", "LG": "LVA", "MX": "MEX",
        "TU": "TUR", "BR": "BRA", "RO": "ROU", "BU": "BGR", "CH": "CHN",
        "ID": "IDN", "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA",
    }

    # this file spec is documented in the xlsx file from the archive
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
        }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        # header row: years start in column 2
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    imf_fields = (
        "LP",       # population
        "PPPPC",    # ppp per capita
        "NGDPRPC",  # gdp per capita in constant prices
        "NGDP_D",   # gdp deflator
        )
    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")
    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}
        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")
        # locate the columns we care about from the header row
        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i
        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value):  #and year < last_year:
                        table.insert([
                            year, country, field,
                            float(value.replace(",", ""))
                        ])
def set_filename(self, filename): path = fileutils.getcache(filename, str(self.year)) self.filename = path
def parse_nipa_data():
    """Parse BEA NIPA underlying-detail PCE tables and PCE bridge files.

    Builds, in order:
      * ``<NIPA_SCHEMA>.pce_codes`` — the PCE category hierarchy from
        underlying-detail Table 2.4.5U,
      * ``<NIPA_SCHEMA>.implicit_price_deflators`` — GDP and PCE deflators
        from Table 1.1.9,
      * per-benchmark-year ``pcebridge_<year>`` / ``nipa_codes_<year>``
        tables from the PCE bridge files (1967–2002, several formats).
    """
    test_view = "%s.nipa_groups" % common.config.TEST_SCHEMA
    db.execute("DROP VIEW IF EXISTS %s" % test_view)

    # get table for pce category harmonization
    # pattern strips a trailing parenthesized qualifier containing a digit
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')
    nipa_code_map = {}
    filename = fileutils.getdatapath("nipa_code_map.csv", "usa")
    fh = open(filename)
    csvf = csv.reader(fh)
    for row in csvf:
        if len(row) == 2:
            harmonized = row[0]
            trailing = trailing_pat.match(harmonized)
            if trailing:
                harmonized = trailing.group(1)
            nipa_code_map[row[1]] = harmonized
    fh.close()

    # get nipa series codes from underlying detail tables
    tracker = TableStateTracker()
    tracker.create_table("%s.pce_codes" % config.NIPA_SCHEMA,
                         ["code", "parent", "description"],
                         ["char(7)", "char(7)", "text"], True)
    number_pat = re.compile('^\d+$')
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')
    filename = fileutils.getcache("Section2All_underlying.csv", "bea", "nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    # stacks tracking the current position in the indentation hierarchy
    code_stack = [None]
    indent_stack = [-1]
    # the code mapping has been done such that each item is at least at
    # three levels of disaggregation below the top, i.e. there is always
    # an ancestor at the second level. we only want to keep track of the
    # ancestor at the third level (root is zero)
    # the first level below root has goods and services
    # the second level has durable goods, nondurable goods, and services.
    reverse_code_dict = {}  # harmonized title -> third-level ancestor code
    second_level_nodes = []
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 2.4.5U"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 2.4.5U"):
                    # we only need to go through one instance of this table
                    break
                else:
                    if number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        # these are duplicate codes
                        if title.startswith("Market-based PCE"):
                            continue
                        code = row[2]
                        # leading spaces encode the hierarchy depth
                        current_indent = len(row[1]) - len(title)
                        while current_indent <= indent_stack[-1]:
                            indent_stack.pop()
                            code_stack.pop()
                        indent_stack.append(current_indent)
                        code_stack.append(code)
                        if len(code_stack) > 1:
                            parent = code_stack[-2]
                        else:
                            parent = None
                        title = title.strip()
                        trailing = trailing_pat.match(title)
                        if trailing:
                            title = trailing.group(1)
                        if len(code_stack) > 4:
                            reverse_code_dict[title] = code_stack[3]
                        else:
                            reverse_code_dict[title] = code
                        tracker.insert_row((code, parent, title))
    tracker.flush()
    fh.close()

    # table for price deflators
    tracker.create_table("%s.implicit_price_deflators" % config.NIPA_SCHEMA,
                         ["year", "gdp", "pce"],
                         ["int", "float", "float"])
    filename = fileutils.getcache("Section1all_csv.csv", "bea/nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    data = {}
    # we need to parse two rows before we can populate
    years = {}  # year -> column index, learned from the "Line" header row
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 1.1.9"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 1.1.9"):
                    # this is seasonally adjusted version of the same table
                    break
                else:
                    if row[0] == "Line":
                        for i in range(len(row)):
                            if number_pat.match(row[i]):
                                year = int(row[i])
                                years[year] = i
                                data[year] = {}
                    elif number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        if title == "Gross domestic product":
                            column = "gdp"
                        elif title == "Personal consumption expenditures":
                            column = "pce"
                        else:
                            continue
                        for (year, colindex) in years.items():
                            data[year][column] = float(row[colindex])
    for (year, results) in data.items():
        tracker.insert_row([year, results["gdp"], results["pce"]])
    tracker.flush()
    fh.close()

    # parse pce bridge
    # nested so its methods can close over nipa_code_map/reverse_code_dict
    class IONIPAStateTracker(TableStateTracker):

        def flush(self):
            # also close any file handle left open by get_file_handle()
            TableStateTracker.flush(self)
            if self.fh is not None and not self.fh.closed:
                self.fh.close()

        def __init__(self):
            TableStateTracker.__init__(self)
            self.fh = None
            self.code_dict = None
            # output column order for each pcebridge_<year> table
            self.value_columns = [
                "prod_val", "rail_margin", "truck_margin", "water_margin",
                "air_margin", "pipe_margin", "gaspipe_margin",
                "wholesale_margin", "retail_margin", "purchase_val"
            ]
            # maps pre-1987 spreadsheet column headers to our columns;
            # several tax columns fold into the matching margin column
            self.old_style_field_map = {
                "Producers' Value": "prod_val",
                "MfgExciseTax": "prod_val",
                "RailMargin": "rail_margin",
                "TruckMargin": "truck_margin",
                "WaterMargin": "water_margin",
                "AirMargin": "air_margin",
                "PipeMargin": "pipe_margin",
                "WholesaleMargin": "wholesale_margin",
                "WholesaleTax": "wholesale_margin",
                "RetailMargin": "retail_margin",
                "RetailSalesTax": "retail_margin",
                "OtherRetailTax": "retail_margin",
                "Purchasers' Value": "purchase_val",
            }

        def set_filename(self, filename):
            # resolve within the cache subdir named after the current year
            path = fileutils.getcache(filename, str(self.year))
            self.filename = path

        def set_year(self, year):
            # flush pending work, then start a fresh pcebridge_<year> table
            self.flush()
            self.year = year
            tablename = "%s.pcebridge_%d" % (config.IO_SCHEMA, year)
            fields = ["pce_code", "commodity"] + self.value_columns
            types = ["varchar(6)", "varchar(6)"] + \
                ["bigint"]*len(self.value_columns)
            self.create_table(tablename, fields, types)

        def setup_for_codes(self):
            self.code_dict = {}

        def flush_codes(self):
            # write accumulated pce code descriptions, harmonizing each
            # description and resolving it to a nipa group where possible
            if self.code_dict is not None:
                tablename = "%s.nipa_codes_%d" % (config.IO_SCHEMA, self.year)
                self.create_table(tablename,
                                  ["pce_code", "nipa_group", "description"],
                                  ["varchar(6)", "char(7)", "text"])
                for (code, raw_desc) in self.code_dict.items():
                    desc = raw_desc
                    # strip durability qualifiers like (s.), (d.), (n.d.)
                    if desc.endswith('(s.)') or desc.endswith('(d.)'):
                        desc = desc[:-4].strip()
                    elif desc.endswith('(n.d.)'):
                        desc = desc[:-6].strip()
                    if desc in nipa_code_map:
                        desc = nipa_code_map[desc]
                    if desc in reverse_code_dict:
                        nipa_code = reverse_code_dict[desc]
                    else:
                        nipa_code = None
                    #self.current_stmt(code, nipa_code, raw_desc)
                    self.table.insert([code, nipa_code, raw_desc])
                self.code_dict = None
            self.flush()

        def insert_code_row(self, code, desc):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(code) is float:
                code = int(code)
            self.code_dict[str(code)] = desc.strip()

        def insert_row(self, pce_code, commod, dollar_values, factor=1):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(pce_code) is float:
                pce_code = int(pce_code)
            values = [str(pce_code).strip(), commod.strip()]
            for column in self.value_columns:
                if column in dollar_values:
                    if factor == 1:
                        values.append(dollar_values[column])
                    else:
                        values.append(
                            int(float(dollar_values[column]) * factor))
                else:
                    values.append(None)
            #self.current_stmt(*values)
            self.table.insert(values)

        def parse_old_style_xls(self, year):
            # handles the 1967-1982 workbook layout
            self.set_year(year)
            self.set_filename("%d_PCE_Commodity.xls" % self.year)
            wb = xlrd.open_workbook(self.filename)
            # parse pce bridge data
            sheet = wb.sheet_by_name(
                "%d PCE Workfile - Commodity" % self.year)
            field_indexes = {}  # our column name -> list of sheet columns
            pce_code_idx = 0
            commod_idx = 2
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    if "PCE Category" in row:
                        # header row: learn where each field lives
                        pce_code_idx = row.index("PCE Category")
                        if "Commodity" in row:
                            commod_idx = row.index("Commodity")
                        for i in range(len(row)):
                            xls_col = row[i]
                            if xls_col in self.old_style_field_map:
                                colname = self.old_style_field_map[xls_col]
                                if colname not in field_indexes:
                                    field_indexes[colname] = []
                                field_indexes[colname].append(i)
                    elif len(field_indexes):
                        # data row (only once we've seen a header)
                        pce_code = row[pce_code_idx]
                        commod = str(int(row[commod_idx])).rjust(6, "0")
                        values = {}
                        for (field, columns) in field_indexes.items():
                            # documentation says units are in 100,000 dollars
                            # but the orders of magnitude don't match up with
                            # later years if we use 100
                            components = [int(float(row[column] * 1000))
                                          for column in columns]
                            value = 0
                            for component in components:
                                value += component
                            values[field] = value
                        self.insert_row(pce_code, commod, values)
            # parse codes from neighboring worksheet
            self.setup_for_codes()
            sheet = wb.sheet_by_name(
                "%d PCE Category Descriptions" % self.year)
            code_idx = None
            desc_idx = None
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    # header labels vary: some years suffix " - <year>"
                    codetab = "PCE Category Code"
                    codetab2 = "%s - %d" % (codetab, self.year)
                    if codetab in row or codetab2 in row:
                        if codetab in row:
                            code_idx = row.index(codetab)
                        else:
                            code_idx = row.index(codetab2)
                        desctab = "PCE Category Description - %d" % self.year
                        if desctab in row:
                            desc_idx = row.index(desctab)
                        else:
                            desctab = "PCE Category Description"
                            if desctab in row:
                                desc_idx = row.index(desctab)
                    elif code_idx is not None and desc_idx is not None:
                        code = row[code_idx]
                        desc = str(row[desc_idx])
                        self.insert_code_row(code, desc)
            self.flush_codes()

        def get_file_handle(self, filetype, options={}):
            # NOTE(review): mutable default argument `options={}` — harmless
            # here since it is never mutated, but worth confirming.
            if filetype == "txt":
                self.fh = open(self.filename)
                return self.fh
            elif filetype == "csv":
                self.fh = open(self.filename)
                if "delim" in options:
                    csvf = csv.reader(self.fh, delimiter=options["delim"])
                else:
                    csvf = csv.reader(self.fh)
                return csvf
            elif filetype == "xls":
                wb = xlrd.open_workbook(self.filename)
                return wb

        def parse_text(self, rowcallback):
            # NOTE(review): `filename` and `this` are undefined names here —
            # calling this method raises NameError. It appears to be dead
            # code; presumably `self.filename` / `self` were intended.
            path = fileutils.getcache(filename, str(self.year))
            f = open(path)
            for line in f:
                rowcallback(line, this)
            f.close()

    tracker = IONIPAStateTracker()
    # 1967-1982 share one spreadsheet layout
    tracker.parse_old_style_xls(1967)
    tracker.parse_old_style_xls(1972)
    tracker.parse_old_style_xls(1977)
    tracker.parse_old_style_xls(1982)

    # 1987: fixed-width text, values in thousands of dollars
    tracker.set_year(1987)
    tracker.set_filename("tbld-87.dat")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 103:
            continue
        commod = line[0:6]
        pce_code = line[14:18]
        values = {
            "prod_val": line[21:30],
            "rail_margin": line[30:39],
            "truck_margin": line[39:48],
            "water_margin": line[48:57],
            "air_margin": line[57:66],
            "pipe_margin": line[66:75],
            "wholesale_margin": line[75:84],
            "retail_margin": line[84:93],
            "purchase_val": line[93:102],
        }
        tracker.insert_row(pce_code, commod, values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("io-nipa.doc")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 27:
            continue
        code = line[0:4].strip()
        desc = line[26:].strip()
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1992: tab-delimited
    tracker.set_year(1992)
    tracker.set_filename("TabD.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        values = {
            "prod_val": row[4],
            "rail_margin": row[5],
            "truck_margin": row[6],
            "water_margin": row[7],
            "air_margin": row[8],
            "pipe_margin": row[9],
            "gaspipe_margin": row[10],
            "wholesale_margin": row[11],
            "retail_margin": row[12],
            "purchase_val": row[13],
        }
        tracker.insert_row(row[2], row[0], values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        code = row[0]
        desc = row[4]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1997: comma-delimited
    tracker.set_year(1997)
    tracker.set_filename("AppendixC_Detail.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        values = {
            "prod_val": row[3],
            "rail_margin": row[4],
            "truck_margin": row[5],
            "water_margin": row[6],
            "air_margin": row[7],
            "pipe_margin": row[8],
            "gaspipe_margin": row[9],
            "wholesale_margin": row[10],
            "retail_margin": row[11],
            "purchase_val": row[12],
        }
        tracker.insert_row(row[1], row[0], values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA_PCE.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        code = row[1]
        desc = row[2]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 2002: codes and values live in one workbook, so synthesize pce codes
    tracker.set_year(2002)
    tracker.setup_for_codes()
    # do this simultaneously since it's all one file
    tracker.set_filename("2002_PCE_Bridge.xls")
    wb = tracker.get_file_handle("xls")
    naics_pat = re.compile('[A-Z0-9]{6}')
    sheet = wb.sheet_by_name("PCE_Bridge_Detail")
    pce_codes = []  # index within this list serves as the synthetic code
    for rowindex in range(sheet.nrows):
        row = sheet.row_values(rowindex)
        if len(row) == 13 and naics_pat.match(row[1]):
            pce_desc = row[0]
            # we don't need the distinction between households and
            # nonprofit institutions serving households
            parts = pce_desc.split('-')
            if len(parts) > 1:
                lastpart = parts[-1].strip()
                if lastpart == 'HH' or lastpart == 'NPISH':
                    pce_desc = '-'.join(parts[:-1])
            pce_desc = pce_desc.strip()
            if pce_desc in pce_codes:
                pce_code = pce_codes.index(pce_desc)
            else:
                pce_code = len(pce_codes)
                pce_codes.append(pce_desc)
                tracker.insert_code_row(str(pce_code), pce_desc)
            values = {
                "prod_val": row[3],
                "rail_margin": row[4],
                "truck_margin": row[5],
                "water_margin": row[6],
                "air_margin": row[7],
                "pipe_margin": row[8],
                "gaspipe_margin": row[9],
                "wholesale_margin": row[10],
                "retail_margin": row[11],
                "purchase_val": row[12],
            }
            tracker.insert_row(str(pce_code), row[1], values, 1000)
    tracker.flush_codes()
def parse_env():
    """Parse Japanese environmental input-output tables (1990-2005).

    Reads per-year energy/CO2 workbooks from the jp cache, registers
    sector codes, and inserts one env value per (year, sector, series).
    The 1990-2000 files are .xls; 2005 is .xlsx with a different layout.
    """
    files = {
        # NOTE: 2005 only has the fine classification (細分類);
        # original comment was truncated here
        1990: "ei90187p.xls",
        1995: "ei95186p.xls",
        2000: "ei2000p104v01j.xls",
        2005: "ei2005pc403jp_wt_bd.xlsx",
    }

    def series_names_from_rows(names, units):
        # since these tables are structured identically
        # we'll just do some hard coding
        # builds "name (unit)" labels for data columns (from column 3 on);
        # unnamed columns map to None so they are skipped downstream
        series_names = []
        for i in range(3, len(names)):
            if len(names[i]):
                name = "%s (%s)" % (names[i], units[i])
            else:
                name = None
            series_names.append(name)
        return series_names

    tables = HybridTableCreator(config.SCHEMA)
    for (year, filename) in files.items():
        tables.add_env_table(year, series_max_length=255)
        codes = tables.new_sector_codes(year, "env_ind")
        # "総合計" is the grand-total row; name it and exclude it from data
        codes.curate_code_from_desc("総合計", "total")
        codes.blacklist_code("total")

        path = fileutils.getcache(filename, "jp", str(year))
        if filename.endswith("xls"):
            wb = xlrd.open_workbook(path)
            # each xls file starts with ToC listing tables A-E.
            # E1: direct energy consumption and energy intensity by sector
            # E2: direct CO2 emissions and CO2 intensity by sector
            for sheetname in ("E1", "E2"):
                sheet = wb.sheet_by_name(sheetname)
                min_series_col = 4  # first col whose values interest us
                if sheetname == "E1":
                    min_series_col = 3  # GDP - only want this once
                series_names = series_names_from_rows(sheet.row_values(0),
                                                      sheet.row_values(1))
                for i in range(2, sheet.nrows):
                    row = sheet.row_values(i)
                    code = row[1]
                    # excel reads numeric codes as floats; normalize to
                    # zero-padded 3-char strings
                    if type(code) is float:
                        code = str(int(code)).rjust(3, "0")
                    code = codes.set_code(code, row[2])
                    if code:
                        for (series, value) in zip(series_names, row[3:]):
                            if type(value) is float:
                                tables.insert_env(year, code, series, value)
        elif filename.endswith("xlsx"):
            wb = openpyxl.load_workbook(filename=path, use_iterators=True)
            # E: direct energy consumption and GHG emissions by sector,
            # plus energy and GHG intensity
            sheet = wb.get_sheet_by_name("E")
            rows = sheet.iter_rows()
            # first two rows are names and units
            series_names = series_names_from_rows(
                [cell.internal_value for cell in next(rows)],
                [cell.internal_value for cell in next(rows)])
            for row in rows:
                code = codes.set_code(row[1].internal_value,
                                      row[2].internal_value)
                if code:
                    for (series, cell) in zip(series_names, row[3:]):
                        if cell.internal_value is not None:
                            tables.insert_env(year, code, series,
                                              cell.internal_value)
        codes.update_codes()
def parse_text(self, rowcallback): path = fileutils.getcache(filename, str(self.year)) f = open(path) for line in f: rowcallback(line, this) f.close()
import csv from usa import bea, config, eia, common, wiod_code_map from common.dbconnect import db from common import fileutils, utils, sqlhelper old_meat_codes = ["140101", "140102", "140103", "140105"] new_meat_codes = ["311611", "311612", "311615", "31161A"] combined_meat_codes = old_meat_codes + new_meat_codes for year in config.STUDY_YEARS: print(year) path = fileutils.getcache("fossil_fuel_estimates_%d.csv" % year, "usa") fh = open(path, "r") csvf = csv.reader(fh) io_codes = common.io_codes_for_year(year) data = {} row = next(csvf) for row in csvf: if len(row) == 6: sector = row[0] btu = row[1] # total #btu = row[2] # coal #btu = row[3] # natural gas #btu = row[5] # PA-nontrans data[sector] = float(btu)
def doparse():
    """Parse US benchmark input-output make and use tables, 1972-2002.

    Each benchmark year ships in a different physical format (fixed-width,
    tab-delimited, csv, or mixed tab/space), so each year gets its own
    parsing stanza. Values are scaled to dollars via the per-year factor
    passed to the tracker insert methods.

    Bug fixed: the 1992 use table passed ``line[12]`` (a single character
    of the raw line) as the retail margin; every sibling field reads from
    the tab-split ``row``, so it now reads ``row[12]``.
    """
    tracker = IOTableStateTracker()

    #tracker.create_simple_transaction_table(
    #    "1947", "1947/1947 Transactions 85-level Data.txt")
    #tracker.create_simple_transaction_table(
    #    "1958", "1958/1958 Transactions 85-level Data.txt")
    #tracker.create_simple_transaction_table(
    #    "1963", "1963/1963 Transactions 367-level Data.txt")
    #tracker.create_simple_transaction_table(
    #    "1967", "1967/1967 Transactions 484-level Data.txt", 1000)
    tracker.create_simple_make_use(
        "1972", "1972/1972 Transactions 496-level Data.txt", 1000)
    tracker.create_simple_make_use(
        "1977", "1977/1977 Transactions 537-level Data.txt", 1000)

    # 1982: one fixed-width file carries both make and use values
    tracker.create_make_table("1982")
    tracker.create_use_table("1982", True)
    with open(fileutils.getcache("82-6DT.DAT", "1982"), "r") as f:
        for line in f:
            if len(line) >= 112:
                # right-aligned
                input_ind = line[0:6]
                output_ind = line[6:12]
                use_dollars = line[12:22]
                make_dollars = line[22:32]
                tracker.insert_make(input_ind, output_ind, make_dollars, 100)
                tracker.insert_use(input_ind, output_ind, use_dollars,
                                   {"margins": line[32:42],
                                    "rail_margin": line[42:52],
                                    "truck_margin": line[52:62],
                                    "water_margin": line[62:72],
                                    "air_margin": line[72:82],
                                    "pipe_margin": line[82:92],
                                    "wholesale_margin": line[92:102],
                                    "retail_margin": line[102:112]},
                                   100)

    # this year dollars are in 100,000s
    tracker.create_make_table("1987")
    with open(fileutils.getcache("TBL1-87.DAT", "1987"), "r") as f:
        for line in f:
            if len(line) >= 24:
                # right-aligned
                tracker.insert_make(
                    line[0:6], line[7:13], line[15:24], 1000)
    tracker.create_use_table("1987", True)
    with open(fileutils.getcache("TBL2-87.DAT", "1987"), "r") as f:
        for line in f:
            if len(line) >= 96:
                # right-aligned
                input_ind = line[0:6]
                output_ind = line[7:13]
                use_dollars = line[15:24].strip()
                tracker.insert_use(input_ind, output_ind, use_dollars,
                                   {"margins": line[24:33],
                                    "rail_margin": line[33:42],
                                    "truck_margin": line[42:51],
                                    "water_margin": line[51:60],
                                    "air_margin": line[60:69],
                                    "pipe_margin": line[69:78],
                                    "wholesale_margin": line[78:87],
                                    "retail_margin": line[87:96]},
                                   1000)

    # the documentation for 1992 appears very incorrect unless there
    # is some way for tabs to be 7 characters for two fields and 9
    # characters for the rest of the fields. we will just assume the
    # file is an ordinary tab-delimited file.
    tracker.create_make_table("1992")
    with open(fileutils.getcache("IOMAKE.TXT", "1992"), "r") as f:
        for line in f:
            row = line.split("\t")
            if len(row) == 4:
                tracker.insert_make(row[0], row[1], row[3], 1000)
    tracker.create_use_table("1992", True)
    with open(fileutils.getcache("IOUSE.TXT", "1992"), "r") as f:
        for line in f:
            row = line.split("\t")
            if len(row) == 13:
                tracker.insert_use(
                    row[0], row[1], row[3],
                    {"margins": row[4],
                     "rail_margin": row[5],
                     "truck_margin": row[6],
                     "water_margin": row[7],
                     "air_margin": row[8],
                     "pipe_margin": row[9],
                     "gaspipe_margin": row[10],
                     "wholesale_margin": row[11],
                     # fixed: was line[12], one character of the raw line
                     "retail_margin": row[12]},
                    1000)

    # 1997: plain csv
    tracker.create_make_table("1997")
    with open(fileutils.getcache("NAICSMakeDetail.txt", "1997")) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 4:
                tracker.insert_make(row[0], row[1], row[3], 1000)
    tracker.create_use_table("1997", True)
    with open(fileutils.getcache("NAICSUseDetail.txt", "1997")) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 15:
                tracker.insert_use(
                    row[0], row[1], row[4],
                    {"margins": row[5],
                     "rail_margin": row[6],
                     "truck_margin": row[7],
                     "water_margin": row[8],
                     "air_margin": row[9],
                     "pipe_margin": row[10],
                     "gaspipe_margin": row[11],
                     "wholesale_margin": row[12],
                     "retail_margin": row[13]},
                    1000)

    # contrary to the format documentation, revised 2002 tables are
    # delimited with mixed tabs and spaces. they appear fixed width with
    # 8-char tabs. field names fortunately do not contain whitespace.
    valid_line = re.compile("[A-Z0-9]{6}\s")
    tracker.create_make_table("2002")
    with open(fileutils.getcache("REV_NAICSMakeDetail 4-24-08.txt",
                                 "2002")) as f:
        fields = dbsetup.get_header_locations(
            dbsetup.replace_tabs(f.readline().strip()))
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(
                    dbsetup.replace_tabs(line), fields)
                tracker.insert_make(
                    row["Industry"], row["Commodity"], row["ProVal"], 1000)
    tracker.create_use_table("2002", True)
    with open(fileutils.getcache("REV_NAICSUseDetail 4-24-08.txt",
                                 "2002")) as f:
        # cheat here because it's not worth the trouble to deal with
        # lack of whitespace between two fields (GasPipeVal and WhsVal)
        line = f.readline().strip().replace("GasPipeVal", "GasPipe ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(
                    dbsetup.replace_tabs(line), fields)
                tracker.insert_use(
                    row["Commodity"], row["Industry"], row["ProVal"],
                    {"margins": row["StripMar"],
                     "rail_margin": row["RailVal"],
                     "truck_margin": row["TruckVal"],
                     "water_margin": row["WaterVal"],
                     "air_margin": row["AirVal"],
                     "pipe_margin": row["PipeVal"],
                     "gaspipe_margin": row["GasPipe"],
                     "wholesale_margin": row["WhsVal"],
                     "retail_margin": row["RetVal"]},
                    1000)

    tracker.flush()
def parse_io():
    """Parse WIOD national IO tables and supply/use tables.

    Populates per-year ``niot_<year>`` tables (industry-by-industry flows,
    flagged domestic vs. imported) from the NIOT xlsx files, then
    ``make_<year>`` / ``use_<year>`` from the country SUT workbooks.
    """
    ### for ind x ind tables
    tables = {}
    colnames = ["country", "from_ind", "to_ind", "is_import", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "bool", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s.niot_%d" % (config.WIOD_SCHEMA, year)
        tables[year] = SQLTable(tablename, colnames, coltypes)#.create()
        tables[year].drop()
        tables[year].create()
        tables[year].truncate()

    va_sectors = set(config.va_sectors.values())
    for country in config.countries.keys():
        filename = "%s_NIOT_ROW_Apr12.xlsx" % country
        subdir = os.path.join("wiod", "niot")
        path = fileutils.getcache(filename, subdir)
        wb = openpyxl.load_workbook(filename=path, use_iterators=True)
        for year in config.STUDY_YEARS:
            imports = {}
            sheet = wb.get_sheet_by_name("%d" % year)
            rows = sheet.iter_rows()
            # skip forward to the marker row preceding the header
            industry_row = None
            for row in rows:
                cell = row[0]
                if cell.internal_value == "(industry-by-industry)":
                    industry_row = row
                    break
            row = next(rows)  # industry names
            industry_codes = []
            for (code_cell, desc_cell) in zip(industry_row, row):
                code = code_cell.internal_value
                desc = desc_cell.internal_value
                industry_codes.append(industry_tracker.set_code(code, desc))
            for row in rows:
                from_code = None
                from_desc = None
                is_import = False
                for (to_code, value_cell) in zip(industry_codes, row):
                    column = value_cell.column
                    value = value_cell.internal_value
                    # excel columns use letters: A holds the row's sector
                    # code, B its description, C finalizes the code; data
                    # starts after column D
                    if column == "A":
                        from_code = value_cell.internal_value
                    elif column == "B":
                        from_desc = value_cell.internal_value
                    elif column == "C":
                        from_code = industry_tracker.set_code(
                            from_code, from_desc)
                        if not from_code:
                            break
                    # rows labeled "Imports" flip the flag for the
                    # remaining values on the row
                    if type(value) is str and value == "Imports":
                        is_import = True
                    elif (column > "D" or len(column) > 1) \
                            and to_code and value != 0:
                        tables[year].insert(
                            [country, from_code, to_code, is_import, value])

    ### for supply and use tables
    def parse_sut(sheet_name, table_prefix):
        # shared parser for the supply ("SUP_bas") and use ("USE_bas")
        # sheets; rows carry (year, commodity code, commodity desc, values)
        tables = {}
        colnames = ["country", "commodity", "industry", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        for year in config.STUDY_YEARS:
            tablename = "%s_%d" % (table_prefix, year)
            tables[year] = SQLTable(tablename, colnames, coltypes).create()
            tables[year].truncate()

        for country in config.countries.keys():
            # TODO: more automated way to get this
            if country in ("AUS", "DEU", "GBR", "USA"):
                filename = "%s_SUT_Feb12.xls" % country
            else:
                filename = "%s_SUT_Jan12.xls" % country
            subdir = os.path.join("wiod", "suts")
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)
            # extract supply and use tables at fob prices
            sheet = wb.sheet_by_name(sheet_name)
            # rows 0 and 1 carry industry codes and descriptions
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))
            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                if not len(row[0].strip()):
                    continue
                year = int(row[0])
                if year not in config.STUDY_YEARS:
                    continue
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        tables[year].insert(
                            [country, com_code, ind_code, value])

    # make tables
    parse_sut("SUP_bas", "%s.make" % config.WIOD_SCHEMA)
    # use tables
    parse_sut("USE_bas", "%s.use" % config.WIOD_SCHEMA)
def parse_nipa_data():
    """Parse BEA NIPA underlying-detail PCE tables and PCE bridge files.

    NOTE(review): this is a near-verbatim duplicate of an earlier
    ``parse_nipa_data`` definition in this file; in Python the later
    definition wins, so the earlier copy is shadowed.

    Builds the PCE category hierarchy (Table 2.4.5U), the GDP/PCE implicit
    price deflators (Table 1.1.9), and per-benchmark-year PCE bridge and
    code tables (1967-2002, several on-disk formats).
    """
    test_view = "%s.nipa_groups" % common.config.TEST_SCHEMA
    db.execute("DROP VIEW IF EXISTS %s" % test_view)

    # get table for pce category harmonization
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')
    nipa_code_map = {}
    filename = fileutils.getdatapath("nipa_code_map.csv", "usa")
    fh = open(filename)
    csvf = csv.reader(fh)
    for row in csvf:
        if len(row) == 2:
            harmonized = row[0]
            trailing = trailing_pat.match(harmonized)
            if trailing:
                harmonized = trailing.group(1)
            nipa_code_map[row[1]] = harmonized
    fh.close()

    # get nipa series codes from underlying detail tables
    tracker = TableStateTracker()
    tracker.create_table("%s.pce_codes" % config.NIPA_SCHEMA,
                         ["code", "parent", "description"],
                         ["char(7)", "char(7)", "text"], True)
    number_pat = re.compile('^\d+$')
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')
    filename = fileutils.getcache("Section2All_underlying.csv", "bea", "nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    code_stack = [None]
    indent_stack = [-1]
    # the code mapping has been done such that each item is at least at
    # three levels of disaggregation below the top, i.e. there is always
    # an ancestor at the second level. we only want to keep track of the
    # ancestor at the third level (root is zero)
    # the first level below root has goods and services
    # the second level has durable goods, nondurable goods, and services.
    reverse_code_dict = {}
    second_level_nodes = []
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 2.4.5U"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 2.4.5U"):
                    # we only need to go through one instance of this table
                    break
                else:
                    if number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        # these are duplicate codes
                        if title.startswith("Market-based PCE"):
                            continue
                        code = row[2]
                        # leading spaces encode hierarchy depth
                        current_indent = len(row[1]) - len(title)
                        while current_indent <= indent_stack[-1]:
                            indent_stack.pop()
                            code_stack.pop()
                        indent_stack.append(current_indent)
                        code_stack.append(code)
                        if len(code_stack) > 1:
                            parent = code_stack[-2]
                        else:
                            parent = None
                        title = title.strip()
                        trailing = trailing_pat.match(title)
                        if trailing:
                            title = trailing.group(1)
                        if len(code_stack) > 4:
                            reverse_code_dict[title] = code_stack[3]
                        else:
                            reverse_code_dict[title] = code
                        tracker.insert_row((code, parent, title))
    tracker.flush()
    fh.close()

    # table for price deflators
    tracker.create_table("%s.implicit_price_deflators" % config.NIPA_SCHEMA,
                         ["year", "gdp", "pce"],
                         ["int", "float", "float"])
    filename = fileutils.getcache("Section1all_csv.csv", "bea/nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    data = {}
    # we need to parse two rows before we can populate
    years = {}
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 1.1.9"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 1.1.9"):
                    # this is seasonally adjusted version of the same table
                    break
                else:
                    if row[0] == "Line":
                        for i in range(len(row)):
                            if number_pat.match(row[i]):
                                year = int(row[i])
                                years[year] = i
                                data[year] = {}
                    elif number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        if title == "Gross domestic product":
                            column = "gdp"
                        elif title == "Personal consumption expenditures":
                            column = "pce"
                        else:
                            continue
                        for (year, colindex) in years.items():
                            data[year][column] = float(row[colindex])
    for (year, results) in data.items():
        tracker.insert_row([year, results["gdp"], results["pce"]])
    tracker.flush()
    fh.close()

    # parse pce bridge
    # nested so methods can close over nipa_code_map/reverse_code_dict
    class IONIPAStateTracker(TableStateTracker):

        def flush(self):
            TableStateTracker.flush(self)
            if self.fh is not None and not self.fh.closed:
                self.fh.close()

        def __init__(self):
            TableStateTracker.__init__(self)
            self.fh = None
            self.code_dict = None
            # output column order for each pcebridge_<year> table
            self.value_columns = [
                "prod_val", "rail_margin", "truck_margin", "water_margin",
                "air_margin", "pipe_margin", "gaspipe_margin",
                "wholesale_margin", "retail_margin", "purchase_val"
            ]
            # pre-1987 header labels -> our columns; taxes fold into margins
            self.old_style_field_map = {
                "Producers' Value": "prod_val",
                "MfgExciseTax": "prod_val",
                "RailMargin": "rail_margin",
                "TruckMargin": "truck_margin",
                "WaterMargin": "water_margin",
                "AirMargin": "air_margin",
                "PipeMargin": "pipe_margin",
                "WholesaleMargin": "wholesale_margin",
                "WholesaleTax": "wholesale_margin",
                "RetailMargin": "retail_margin",
                "RetailSalesTax": "retail_margin",
                "OtherRetailTax": "retail_margin",
                "Purchasers' Value": "purchase_val",
            }

        def set_filename(self, filename):
            path = fileutils.getcache(filename, str(self.year))
            self.filename = path

        def set_year(self, year):
            # flush pending work, then start a fresh pcebridge_<year> table
            self.flush()
            self.year = year
            tablename = "%s.pcebridge_%d" % (config.IO_SCHEMA, year)
            fields = ["pce_code", "commodity"] + self.value_columns
            types = ["varchar(6)", "varchar(6)"] + \
                ["bigint"]*len(self.value_columns)
            self.create_table(tablename, fields, types)

        def setup_for_codes(self):
            self.code_dict = {}

        def flush_codes(self):
            # write accumulated code descriptions, harmonized and mapped
            # to nipa groups where possible
            if self.code_dict is not None:
                tablename = "%s.nipa_codes_%d" % (config.IO_SCHEMA, self.year)
                self.create_table(tablename,
                                  ["pce_code", "nipa_group", "description"],
                                  ["varchar(6)", "char(7)", "text"])
                for (code, raw_desc) in self.code_dict.items():
                    desc = raw_desc
                    # strip durability qualifiers (s.), (d.), (n.d.)
                    if desc.endswith('(s.)') or desc.endswith('(d.)'):
                        desc = desc[:-4].strip()
                    elif desc.endswith('(n.d.)'):
                        desc = desc[:-6].strip()
                    if desc in nipa_code_map:
                        desc = nipa_code_map[desc]
                    if desc in reverse_code_dict:
                        nipa_code = reverse_code_dict[desc]
                    else:
                        nipa_code = None
                    #self.current_stmt(code, nipa_code, raw_desc)
                    self.table.insert([code, nipa_code, raw_desc])
                self.code_dict = None
            self.flush()

        def insert_code_row(self, code, desc):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(code) is float:
                code = int(code)
            self.code_dict[str(code)] = desc.strip()

        def insert_row(self, pce_code, commod, dollar_values, factor=1):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(pce_code) is float:
                pce_code = int(pce_code)
            values = [str(pce_code).strip(), commod.strip()]
            for column in self.value_columns:
                if column in dollar_values:
                    if factor == 1:
                        values.append(dollar_values[column])
                    else:
                        values.append(
                            int(float(dollar_values[column]) * factor))
                else:
                    values.append(None)
            #self.current_stmt(*values)
            self.table.insert(values)

        def parse_old_style_xls(self, year):
            # handles the 1967-1982 workbook layout
            self.set_year(year)
            self.set_filename("%d_PCE_Commodity.xls" % self.year)
            wb = xlrd.open_workbook(self.filename)
            # parse pce bridge data
            sheet = wb.sheet_by_name(
                "%d PCE Workfile - Commodity" % self.year)
            field_indexes = {}
            pce_code_idx = 0
            commod_idx = 2
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    if "PCE Category" in row:
                        # header row: learn where each field lives
                        pce_code_idx = row.index("PCE Category")
                        if "Commodity" in row:
                            commod_idx = row.index("Commodity")
                        for i in range(len(row)):
                            xls_col = row[i]
                            if xls_col in self.old_style_field_map:
                                colname = self.old_style_field_map[xls_col]
                                if colname not in field_indexes:
                                    field_indexes[colname] = []
                                field_indexes[colname].append(i)
                    elif len(field_indexes):
                        pce_code = row[pce_code_idx]
                        commod = str(int(row[commod_idx])).rjust(6, "0")
                        values = {}
                        for (field, columns) in field_indexes.items():
                            # documentation says units are in 100,000 dollars
                            # but the orders of magnitude don't match up with
                            # later years if we use 100
                            components = [
                                int(float(row[column] * 1000))
                                for column in columns
                            ]
                            value = 0
                            for component in components:
                                value += component
                            values[field] = value
                        self.insert_row(pce_code, commod, values)
            # parse codes from neighboring worksheet
            self.setup_for_codes()
            sheet = wb.sheet_by_name(
                "%d PCE Category Descriptions" % self.year)
            code_idx = None
            desc_idx = None
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    # header labels vary: some years suffix " - <year>"
                    codetab = "PCE Category Code"
                    codetab2 = "%s - %d" % (codetab, self.year)
                    if codetab in row or codetab2 in row:
                        if codetab in row:
                            code_idx = row.index(codetab)
                        else:
                            code_idx = row.index(codetab2)
                        desctab = "PCE Category Description - %d" % self.year
                        if desctab in row:
                            desc_idx = row.index(desctab)
                        else:
                            desctab = "PCE Category Description"
                            if desctab in row:
                                desc_idx = row.index(desctab)
                    elif code_idx is not None and desc_idx is not None:
                        code = row[code_idx]
                        desc = str(row[desc_idx])
                        self.insert_code_row(code, desc)
            self.flush_codes()

        def get_file_handle(self, filetype, options={}):
            # NOTE(review): mutable default argument — never mutated here
            if filetype == "txt":
                self.fh = open(self.filename)
                return self.fh
            elif filetype == "csv":
                self.fh = open(self.filename)
                if "delim" in options:
                    csvf = csv.reader(self.fh, delimiter=options["delim"])
                else:
                    csvf = csv.reader(self.fh)
                return csvf
            elif filetype == "xls":
                wb = xlrd.open_workbook(self.filename)
                return wb

        def parse_text(self, rowcallback):
            # NOTE(review): `filename` and `this` are undefined names —
            # calling this raises NameError; appears to be dead code
            path = fileutils.getcache(filename, str(self.year))
            f = open(path)
            for line in f:
                rowcallback(line, this)
            f.close()

    tracker = IONIPAStateTracker()
    # 1967-1982 share one spreadsheet layout
    tracker.parse_old_style_xls(1967)
    tracker.parse_old_style_xls(1972)
    tracker.parse_old_style_xls(1977)
    tracker.parse_old_style_xls(1982)

    # 1987: fixed-width text
    tracker.set_year(1987)
    tracker.set_filename("tbld-87.dat")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 103:
            continue
        commod = line[0:6]
        pce_code = line[14:18]
        values = {
            "prod_val": line[21:30],
            "rail_margin": line[30:39],
            "truck_margin": line[39:48],
            "water_margin": line[48:57],
            "air_margin": line[57:66],
            "pipe_margin": line[66:75],
            "wholesale_margin": line[75:84],
            "retail_margin": line[84:93],
            "purchase_val": line[93:102],
        }
        tracker.insert_row(pce_code, commod, values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("io-nipa.doc")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 27:
            continue
        code = line[0:4].strip()
        desc = line[26:].strip()
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1992: tab-delimited
    tracker.set_year(1992)
    tracker.set_filename("TabD.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        values = {
            "prod_val": row[4],
            "rail_margin": row[5],
            "truck_margin": row[6],
            "water_margin": row[7],
            "air_margin": row[8],
            "pipe_margin": row[9],
            "gaspipe_margin": row[10],
            "wholesale_margin": row[11],
            "retail_margin": row[12],
            "purchase_val": row[13],
        }
        tracker.insert_row(row[2], row[0], values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        code = row[0]
        desc = row[4]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1997: comma-delimited
    tracker.set_year(1997)
    tracker.set_filename("AppendixC_Detail.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        values = {
            "prod_val": row[3],
            "rail_margin": row[4],
            "truck_margin": row[5],
            "water_margin": row[6],
            "air_margin": row[7],
            "pipe_margin": row[8],
            "gaspipe_margin": row[9],
            "wholesale_margin": row[10],
            "retail_margin": row[11],
            "purchase_val": row[12],
        }
        tracker.insert_row(row[1], row[0], values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA_PCE.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        code = row[1]
        desc = row[2]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 2002: codes and values in one workbook, synthesize pce codes
    tracker.set_year(2002)
    tracker.setup_for_codes()
    # do this simultaneously since it's all one file
    tracker.set_filename("2002_PCE_Bridge.xls")
    wb = tracker.get_file_handle("xls")
    naics_pat = re.compile('[A-Z0-9]{6}')
    sheet = wb.sheet_by_name("PCE_Bridge_Detail")
    pce_codes = []  # list index doubles as the synthetic pce code
    for rowindex in range(sheet.nrows):
        row = sheet.row_values(rowindex)
        if len(row) == 13 and naics_pat.match(row[1]):
            pce_desc = row[0]
            # we don't need the distinction between households and
            # nonprofit institutions serving households
            parts = pce_desc.split('-')
            if len(parts) > 1:
                lastpart = parts[-1].strip()
                if lastpart == 'HH' or lastpart == 'NPISH':
                    pce_desc = '-'.join(parts[:-1])
            pce_desc = pce_desc.strip()
            if pce_desc in pce_codes:
                pce_code = pce_codes.index(pce_desc)
            else:
                pce_code = len(pce_codes)
                pce_codes.append(pce_desc)
                tracker.insert_code_row(str(pce_code), pce_desc)
            values = {
                "prod_val": row[3],
                "rail_margin": row[4],
                "truck_margin": row[5],
                "water_margin": row[6],
                "air_margin": row[7],
                "pipe_margin": row[8],
                "gaspipe_margin": row[9],
                "wholesale_margin": row[10],
                "retail_margin": row[11],
                "purchase_val": row[12],
            }
            tracker.insert_row(str(pce_code), row[1], values, 1000)
    tracker.flush_codes()
def doparse():
    """Load population, GDP, and PPP figures into the world_supplement table.

    Merges three cached sources for a fixed set of countries:
      - Census IDB "IDBext001.txt" (pipe-delimited) for population by year
      - two World Bank indicator spreadsheets for per-capita GNP-PPP and GDP

    Inserts one (year, country, pop, gdp, ppp) row per country-year; gdp
    and ppp are left as None (SQL NULL) when the indicator is missing.
    Years with no population record are discarded entirely.
    """
    # ppp rank from
    # https://www.cia.gov/library/publications/the-world-factbook/rankorder/2004rank.html
    # keys are ISO alpha-3 codes; "fips" is the FIPS 10-4 code used by the
    # Census IDB file, "ppp" is the CIA world-factbook PPP rank (informational)
    countries = {
        "LUX": {"fips": "LU", "ppp": 3},
        "USA": {"fips": "US", "ppp": 11},
        "NLD": {"fips": "NL", "ppp": 17},
        "AUT": {"fips": "AU", "ppp": 18},
        "SWE": {"fips": "SW", "ppp": 21},
        "CAN": {"fips": "CA", "ppp": 20},
        "AUS": {"fips": "AS", "ppp": 22},
        "IRL": {"fips": "EI", "ppp": 23},
        "DEU": {"fips": "GM", "ppp": 26},
        "TWN": {"fips": "TW", "ppp": 27},
        "BEL": {"fips": "BE", "ppp": 28},
        "DNK": {"fips": "DK", "ppp": 29},
        "FIN": {"fips": "FI", "ppp": 32},
        "GBR": {"fips": "UK", "ppp": 33},
        "FRA": {"fips": "FR", "ppp": 35},
        "JPN": {"fips": "JA", "ppp": 36},
        "KOR": {"fips": "KS", "ppp": 40},
        "ESP": {"fips": "SP", "ppp": 43},
        "ITA": {"fips": "IT", "ppp": 44},
        "CYP": {"fips": "CY", "ppp": 46},
        "SVN": {"fips": "SI", "ppp": 47},
        "CZE": {"fips": "EZ", "ppp": 50},  # EZ??
        "GRC": {"fips": "GR", "ppp": 52},
        "MLT": {"fips": "MT", "ppp": 53},
        "PRT": {"fips": "PO", "ppp": 57},
        "SVK": {"fips": "LO", "ppp": 58},
        "POL": {"fips": "PL", "ppp": 60},
        "EST": {"fips": "EN", "ppp": 61},
        "HUN": {"fips": "HU", "ppp": 63},
        "LTU": {"fips": "LH", "ppp": 65},
        "RUS": {"fips": "RS", "ppp": 71},
        "LVA": {"fips": "LG", "ppp": 75},
        "MEX": {"fips": "MX", "ppp": 85},
        "TUR": {"fips": "TU", "ppp": 86},
        "BRA": {"fips": "BR", "ppp": 92},
        "ROU": {"fips": "RO", "ppp": 97},
        "BGR": {"fips": "BU", "ppp": 101},
        "CHN": {"fips": "CH", "ppp": 121},
        "IDN": {"fips": "ID", "ppp": 156},
        "IND": {"fips": "IN", "ppp": 164},
        }

    tablename = "world_supplement"
    table = SQLTable(tablename,
                     ["year", "country", "pop", "gdp", "ppp"],
                     ["int", "char(3)", "int", "float", "float"]).create()
    table.truncate()

    # build FIPS -> ISO lookup and an empty per-country year map
    country_fips = {}
    data = {}
    for (country, info) in countries.items():
        data[country] = {}
        country_fips[info["fips"]] = country

    # this file spec is documented in the xlsx file from the archive
    # (pipe-delimited: fips|year|population)
    thisyear = datetime.datetime.now().year
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    if year >= thisyear:
                        # we don't want future projections
                        continue
                    country = country_fips[fips]
                    data[country][year] = {"pop": int(fields[2])}

    # World Bank indicator workbooks: row[1] is the ISO alpha-3 code,
    # columns from index 2 onward are years (per the header row)
    worldbank = {
        "ppp": "NY.GNP.PCAP.PP.CD_Indicator_MetaData_en_EXCEL.xls",
        "gdp": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    # this discards years where we don't have population
                    if year in data[country] and \
                            type(value) is float and value != 0:
                        data[country][year][indicator] = value

    # flush the merged structure to SQL; missing indicators become NULL
    for (country, country_data) in data.items():
        for (year, year_data) in country_data.items():
            ppp = None
            gdp = None
            pop = year_data["pop"]
            if "gdp" in year_data:
                gdp = year_data["gdp"]
            if "ppp" in year_data:
                ppp = year_data["ppp"]
            table.insert([year, country, pop, gdp, ppp])
def doparse():
    """Load BTS air-carrier freight/passenger totals into air_carriers.

    Reads cached "freight<year>.csv" / "passengers<year>.csv" files for each
    study year and inserts (year, country, series, value) rows, where the
    carrier code is mapped to the ISO alpha-3 country of the carrier via the
    hard-coded table below.  Carriers with no confident country assignment
    are kept as commented-out entries and therefore skipped.

    NOTE(review): the SQL column is named "carrier" but what is inserted is
    the mapped country code — confirm whether the column name is historical.
    """
    # carrier code -> ISO alpha-3 country; commented entries are carriers
    # whose country was ambiguous or outside the study set
    carrier_countries = {
        #"-": "", # Unknown
        "1I": "USA", # Sky Trek International Airlines
        "2T": "CAN", # Canada 3000 Airlines Ltd.
        "3Z": "USA", # Tatonduk Outfitters Limited d/b/a Everts Air Alaska and Everts Air Cargo
        "5X": "USA", # United Parcel Service
        "5Y": "USA", # Atlas Air Inc.
        "6F": "GBR", # Laker Airways Inc.
        #"6U": "", # Air Ukraine
        #"6Y": "", # Nicaraguense De Aviacion Sa
        #"7P": "", # Apa International Air S.A. (dominican rep)
        #"7Z": "", # Lb Limited
        "8C": "USA", # Air Transport International
        "AA": "USA", # American Airlines Inc.
        "AC": "CAN", # Air Canada
        #"ADB": "", # Antonov Company (ukraine)
        "AF": "FRA", # Compagnie Nat'l Air France
        "AI": "IND", # National Aviation Company of India Limited d/b/a Air India
        "AM": "MEX", # Aeromexico
        #"AQQ": "", # Air Charter (Safa)
        #"AR": "", # Aerolineas Argentinas
        "AS": "USA", # Alaska Airlines Inc.
        #"AT": "", # Royal Air Maroc (morocco)
        #"AV": "", # Aerovias Nac'l De Colombia
        "AY": "FIN", # Finnair Oy
        "AZ": "ITA", # Compagnia Aerea Italiana
        #"All Rows": "", # All Rows (including those not displayed)
        "BA": "GBR", # British Airways Plc
        #"BBQ": "", # Balair Ag (swiss)
        "BCQ": "CAN", # Bradley Air Services Ltd.
        #"BG": "", # Biman Bangladesh Airlines
        "BQ": "MEX", # Aeromar C. Por A.
        "BR": "TWN", # Eva Airways Corporation
        #"BW": "", # Caribbean Airlines Limited (trinidad and tobago)
        "BY": "GBR", # Britannia Airways Ltd.
        "CA": "CHN", # Air China
        #"CC": "", # Air Atlanta Icelandic
        "CDQ": "USA", # Kitty Hawk International
        #"CF": "", # Compan. De Aviacion Faucett (peru)
        "CI": "TWN", # China Airlines Ltd.
        #"CLQ": "", # Aero Transcolombiana
        #"CM": "", # Compania Panamena (Copa)
        "CO": "USA", # Continental Air Lines Inc.
        "CP (1)": "CAN", # Canadian Airlines International Ltd.
        "CS": "USA", # Continental Micronesia
        "CV": "LUX", # Cargolux Airlines International S.A
        #"CVQ": "", # Caraven S.A.
        #"CX": "", # Cathay Pacific Airways Ltd. (hong kong, includes pre 1997)
        "CYQ": "FRA", # Corse Air International (assuming corsair)
        "CZ": "CHN", # China Southern Airlines
        "DE": "DEU", # Condor Flugdienst
        "DHQ": "GBR", # DHL Aero Expresso
        "DL": "USA", # Delta Air Lines Inc.
        #"ED": "", # Andes (ecuador or argentina)
        "EH": "ESP", # Saeta Airlines
        "EI": "IRL", # Aer Lingus Plc
        #"EOQ": "", # Aeroservicios Ecuatorianos
        "ER": "USA", # Astar USA, LLC
        #"EU": "", # Ecuatoriana De Aviacion
        #"EXQ": "", # Export Air Del Peru S.A.
        "EZ": "TWN", # Evergreen International Inc.
        "F9": "USA", # Frontier Airlines Inc.
        "FCQ": "USA", # Falcon Air Express
        #"FF": "", # Tower Air Inc.
        #"FI": "", # Icelandair
        #"FJ": "", # Air Pacific Ltd. (fiji)
        "FNQ": "USA", # Fine Airlines Inc.
        #"FQ": "", # Air Aruba
        #"FS": "", # Serv De Trans Aereos Fuegui (argentina)
        "FX": "USA", # Federal Express Corporation
        #"G3": "", # Aerochago S.A.
        "GA": "IDN", # P.T. Garuda Indonesian Arwy
        "GD": "MEX", # Transp. Aereos Ejecutivos
        #"GF": "", # Gulf Air Company (bahrain)
        #"GH": "", # Ghana Airways Corporation
        "GJ (1)": "MEX", # Mexicargo
        "GL": "USA", # Miami Air International
        "GR": "USA", # Gemini Air Cargo Airways
        #"GU": "", # Aviateca (guatemala)
        #"GY": "", # Guyana Airways Corporation
        "H2": "BEL", # City Bird
        "H5": "RUS", # Magadan Airlines
        "HA": "USA", # Hawaiian Airlines Inc.
        "HAQ": "DEU", # Hapag Lloyd Flug.
        "HCQ": "USA", # Av Atlantic
        #"HFQ": "", # Haiti Air Freight Intl
        "HLQ": "AUS", # Heavylift Cargo Airlines Lt
        "HP": "USA", # America West Airlines Inc. (Merged with US Airways 9/05. Stopped reporting 10/07.)
        #"HY": "", # Uzbekistan Airways
        "IB": "ESP", # Iberia Air Lines Of Spain
        #"ITQ": "", # Interamericana De Aviacion (uruguay)
        "IW": "FRA", # Air Liberte Aka Aom Minerve
        #"JAQ": "", # Jamaica Air Freighters
        "JD": "JPN", # Japan Air System Co. Ltd.
        "JI (1)": "USA", # Midway Airlines Inc.
        "JK": "ESP", # Spanair S.A.
        "JKQ": "USA", # Express One International Inc.
        "JL": "JPN", # Japan Air Lines Co. Ltd.
        #"JM": "", # Air Jamaica Limited
        "JR": "USA", # Aero California
        "JW": "CAN", # Arrow Air Inc.
        "JZ": "JPN", # Japan Air Charter Co. Ltd.
        "K8 (1)": "NLD", # Dutch Caribbean Airlines
        "KE": "KOR", # Korean Air Lines Co. Ltd.
        "KH": "USA", # Aloha Air Cargo
        #"KI": "", # Time Air Ltd. (south africa)
        "KL": "NLD", # Klm Royal Dutch Airlines
        #"KP": "", # Kiwi International
        "KR": "USA", # Kitty Hawk Aircargo
        "KTQ": "TUR", # Turks Air Ltd.
        #"KU": "", # Kuwait Airways Corp.
        "KW": "USA", # Carnival Air Lines Inc.
        #"KX": "", # Cayman Airways Limited
        "KZ": "JPN", # Nippon Cargo Airlines
        #"LA": "", # Lan-Chile Airlines
        #"LB": "", # Lloyd Aereo Boliviano S. A.
        "LGQ": "MEX", # Lineas Aereas Allegro
        "LH": "DEU", # Lufthansa German Airlines
        "LO": "POL", # Polskie Linie Lotnicze
        #"LR": "", # Lacsa (costa rica)
        #"LSQ": "", # Lineas Aereas Suramerican (colombia)
        "LT": "DEU", # Luftransport-Unternehmen
        #"LU": "", # Air Atlantic Dominicana
        #"LY": "", # El Al Israel Airlines Ltd.
        "LZ": "BGR", # Balkan Bulgarian Airlines
        "M6": "USA", # Amerijet International
        "M7": "MEX", # Aerotransportes Mas De Crga
        "MA": "HUN", # Malev Hungarian Airlines
        "MG": "USA", # Champion Air
        #"MH": "", # Malaysian Airline System
        #"ML": "", # Aero Costa Rica
        "MP": "NLD", # Martinair Holland N.V.
        #"MS": "", # Egyptair
        "MT": "GBR", # Thomas Cook Airlines Uk Ltd.
        "MT (1)": "GBR", # Flying Colours Airlines Ltd.
        "MU": "CHN", # China Eastern Airlines
        #"MUQ": "", # Aerolineas Mundo (columbia)
        "MX": "MEX", # Compania Mexicana De Aviaci
        #"MYQ": "", # Lineas Aereas Mayas (Lamsa)
        #"N5 (1)": "", # Nations Air Express Inc.
        "NA": "USA", # North American Airlines
        "NG": "DEU", # Lauda Air Luftfahrt Ag
        "NH": "JPN", # All Nippon Airways Co.
        "NK": "USA", # Spirit Air Lines
        "NW": "USA", # Northwest Airlines Inc.
        "NWQ": "USA", # N. W. Territorial Airways
        #"NZ": "", # Air New Zealand
        "OA": "GRC", # Olympic Airways
        #"OI": "", # Prestige Airways (uae)
        "OK": "CZE", # Czech Airlines
        #"ON": "", # Air Nauru
        "OS": "AUT", # Austrian Airlines
        "OW": "USA", # Executive Airlines
        "OZ": "KOR", # Asiana Airlines Inc.
        "PA (2)": "USA", # Pan American World Airways
        "PCQ": "USA", # Pace Airlines
        #"PIQ": "", # Pacific International Airlines (ambiguous: usa, panama)
        #"PK": "", # Pakistan International Airlines
        #"PL": "", # Aero Peru
        "PNQ": "USA", # Panagra Airways
        "PO": "USA", # Polar Air Cargo Airways
        #"PR": "", # Philippine Airlines Inc.
        "PRQ": "USA", # Florida West Airlines Inc.
        "PT": "USA", # Capital Cargo International
        #"PY": "", # Surinam Airways Limited
        "Q7": "BEL", # Sobelair
        "QF": "AUS", # Qantas Airways Ltd.
        "QK": "CAN", # Jazz Aviation LP
        #"QN": "", # Royal Air (ambiguous)
        "QO": "MEX", # Aeromexpress
        "QQ": "USA", # Reno Air Inc.
        #"QT": "", # Transportes Aereos Mercantiles Panamericanos S.A (colombia)
        "QTQ": "IRL", # Aer Turas Teoranta
        "QX": "USA", # Horizon Air
        "RD": "USA", # Ryan International Airlines
        "REQ": "USA", # Renown Aviation
        "RG": "BRA", # Varig S. A.
        #"RJ": "", # Alia-(The) Royal Jordanian
        #"RK": "", # Air Afrique
        "RNQ": "GBR", # Mytravel Airways
        "RO": "ROU", # Tarom Romanian Air Transpor
        #"SA": "", # South African Airways
        "SAQ": "USA", # Southern Air Transport Inc.
        "SEQ": "GBR", # Sky Service F.B.O.
        "SIQ": "LUX", # Premiair
        "SK": "SWE", # Scandinavian Airlines Sys.
        "SM": "USA", # Sunworld International Airlines
        "SN (1)": "BEL", # Sabena Belgian World Air.
        "SPQ": "USA", # Sun Pacific International
        #"SQ": "", # Singapore Airlines Ltd.
        #"SR": "", # Swissair Transport Co. Ltd.
        "SU": "RUS", # Aeroflot Russian Airlines
        #"SV": "", # Saudi Arabian Airlines Corp
        "SX (1)": "MEX", # Aeroejecutivo S.A.
        "SY": "USA", # Sun Country Airlines d/b/a MN Airlines
        "T9": "USA", # TransMeridian Airlines
        #"TA": "", # Taca International Airlines (el savador)
        "TCQ": "USA", # Express.Net Airlines
        #"TG": "", # Thai Airways International Ltd.
        "TK": "TUR", # Turk Hava Yollari A.O.
        "TKQ": "USA", # Trans-Air-Link Corporation
        "TNQ": "USA", # Emery Worldwide Airlines
        "TP": "PRT", # Tap-Portuguese Airlines
        "TR": "BRA", # Transbrasil S.A.
        "TRQ": "SWE", # Blue Scandinavia Ab
        "TS": "CAN", # Air Transat
        "TW": "USA", # Trans World Airways LLC
        #"TZ": "", # ATA Airlines d/b/a ATA (iran)
        "TZQ": "GBR", # First Choice Airways
        "U7": "USA", # USA Jet Airlines Inc.
        "UA": "USA", # United Air Lines Inc.
        #"UD": "", # Fast Air Carrier Ltd.
        "UN": "RUS", # Transaero Airlines
        #"UP": "", # Bahamasair Holding Limited
        "US": "USA", # US Airways Inc. (Merged with America West 9/05. Reporting for both starting 10/07.)
        "UX": "ESP", # Air Europa
        #"UYQ": "", # Aerolineas Uruguayas S.A.
        #"VA (1)": "", # Venezuelan International Airways
        #"VC": "", # Servicios Avensa (venezuela)
        #"VE": "", # Aerovias Venezolanas-Avensa
        "VIQ": "RUS", # Volga-Dnepr Airlines
        "VP": "BRA", # Viacao Aerea Sao Paulo
        #"VR": "", # Transportes Aereos De Cabo (cape verde)
        "VS": "GBR", # Virgin Atlantic Airways
        #"VX (1)": "", # Aces Airlines (colombia)
        #"W7": "", # Western Pacific Airlines (solomon islands)
        #"WD": "", # Halisa Air (haiti)
        "WE": "USA", # Centurion Cargo Inc.
        "WO": "USA", # World Airways Inc.
        #"XC": "", # Air Caribbean (1)
        "XE": "USA", # ExpressJet Airlines Inc. (1)
        "XJ": "USA", # Mesaba Airlines
        "XP": "USA", # Casino Express
        "YX (1)": "USA", # Midwest Airline, Inc.
        "ZB": "USA", # Monarch Airlines
        #"ZUQ": "", # Zuliana De Aviacion (venezuela)
        "ZX (1)": "CAN", # Airbc Ltd.
        }

    tablename = "air_carriers"
    table = SQLTable(
        tablename,
        ["year", "carrier", "series", "value"],
        ["int", "varchar(15)", "varchar(15)", "int"])
    table.create()
    table.truncate()

    for year in config.STUDY_YEARS:
        for filestem in ["freight", "passengers"]:
            filename = filestem + str(year) + ".csv"
            path = fileutils.getcache(filename, "bts")
            with open(path) as fh:
                csvf = csv.reader(fh)
                # first two lines are a title row and the column header row;
                # neither carries data
                next(csvf)
                next(csvf)
                for row in csvf:
                    # data rows have exactly (carrier, carrier_name, value)
                    if len(row) == 3:
                        carrier = row[0]
                        #carrier_name = row[1]
                        if carrier in carrier_countries:
                            country = carrier_countries[carrier]
                            value = int(row[2])
                            table.insert([year, country, filestem, value])
def parse_io():
    """Parse Japanese input-output tables into per-year IO SQL tables.

    Fix vs. original: the inner column loops used ``i`` as their index,
    shadowing the outer sheet-row index ``i``.  Python's for-loop iterator
    made this benign, but it is a latent trap; the inner loops now use ``j``.
    Behavior is otherwise unchanged.
    """
    # Use the medium-level classification (中分類) for all IO tables.
    # The 1990/1995 medium-level tables don't break down the electronic
    # sectors as far as i would like, so use the small-level one (小分類).
    files = {
        1990: "l00_21.xls",
        1995: "l00_21.xls",
        2000: "io00a301.xls",
        2005: "io05a301.xls",
        }

    tables = HybridTableCreator(config.SCHEMA)
    for (year, filename) in files.items():
        # 1995 and 2000 io tables: easiest
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        # for 1995 use the heisei 2-7-12 file since it has more
        # harmonized sectors than the standalone 1995 file
        if year == 1995:
            sheetindex = 2
        else:
            # the first page of the heisei 2-7-12 file (used for 1990)
            # happens to be 1990 at nominal prices, matching the others
            sheetindex = 0

        path = fileutils.getcache(filename, "jp", str(year))
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(sheetindex)
        ind_names = None   # header row of sector names, once found
        ind_codes = None   # column sector codes, once found
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if ind_codes is None:
                # scan for the code header row: either numeric codes
                # (cell == 1) or string codes starting at "001"
                for cell in row:
                    if cell == 1:
                        # numeric codes: normalize to zero-padded strings
                        ind_codes = [str(c).strip().rjust(3, "0")
                                     for c in row]
                        break
                    if cell.strip() == "001":
                        ind_codes = row
                        break
            elif ind_names is None:
                # next row holds sector names; register (code, name) pairs
                # and replace ind_codes with the curated codes
                ind_names = row
                temp_codes = [None, None]
                for j in range(2, len(row)):
                    temp_codes.append(
                        codes.set_code(ind_codes[j], row[j]))
                ind_codes = temp_codes
            else:
                # data row: column 0 is the source sector code,
                # column 1 its name, remaining columns are values
                from_code = row[0]
                if type(from_code) is float:
                    from_code = str(int(from_code)).rjust(3, "0")
                from_code = codes.set_code(from_code, row[1])
                if from_code:
                    for j in range(2, len(row)):
                        to_code = ind_codes[j]
                        value = row[j]
                        tables.insert_io(year, from_code, to_code, value)
        codes.update_codes()
def parse_env():
    """Scrape Chinese emissions HTML tables into cn.emissions_<year> tables.

    Walks every numeric subdirectory of the "cn" cache (one per year), finds
    the main data table in each HTML file by heuristics (the first table with
    more than 10 rows), and reconstructs logical rows from a layout that
    abuses rowspans and overflow cells.  Each reconstructed data row is
    inserted as (industry_zh, industry_en, pollutant, amount).

    NOTE(review): relies on module-level helpers not visible here
    (split_english, is_row, is_tag, is_cell, regexes, db, fileutils,
    BeautifulSoup) — the exact contracts are assumed from usage.
    """
    cache_dirs = fileutils.getcachecontents("cn")
    for adir in cache_dirs:
        # only numeric directory names are treated as years
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            # rowdata[0..max_sector_column] are sector name cells; when the
            # english name is not in its own column, split it out of the
            # combined cell.  Remaining cells pair up with pollutant columns.
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
            print(adir, filename)
            title = soup.title.string  # NOTE(review): unused

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise
            prev_rowdata = None
            prev_rowspans = None
            data = []  # NOTE(review): unused

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None  # last candidate row, may still grow
            insert_now = None    # row confirmed complete, ready to insert

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent prefix
                prefix = None  # NOTE(review): unused

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    # strip ellipsis/nbsp filler from cell text
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                # ("单位" means "unit(s)")
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    # this row stands alone; the previous candidate is done
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers:  # near BOF
                if insert_now is not None and columns is None:
                    # first completed row becomes the column header set
                    columns = insert_now
                    insert_now = None

                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop: flush the final pending candidate row
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()
def parse_env():
    """Parse Japanese embodied energy/GHG tables into per-year env tables.

    For each study year, reads the cached 3EID workbook (xls via xlrd for
    1990-2000, xlsx via openpyxl for 2005), curates sector codes, and
    inserts one row per (sector, series) where each series name is built
    as "name (unit)" from the two header rows.
    """
    files = {
        # 2005 only has the most detailed classification (細分類) while
        1990: "ei90187p.xls",
        1995: "ei95186p.xls",
        2000: "ei2000p104v01j.xls",
        2005: "ei2005pc403jp_wt_bd.xlsx",
        }

    def series_names_from_rows(names, units):
        # Build "name (unit)" series labels from the name row and unit row.
        # since these tables are structured identically
        # we'll just do some hard coding
        series_names = []
        for i in range(3, len(names)):
            if len(names[i]):
                name = "%s (%s)" % (names[i], units[i])
            else:
                name = None  # blank name -> skip this column downstream
            series_names.append(name)
        return series_names

    tables = HybridTableCreator(config.SCHEMA)
    for (year, filename) in files.items():
        tables.add_env_table(year, series_max_length=255)
        codes = tables.new_sector_codes(year, "env_ind")
        # "総合計" is the grand-total row; name it "total" and exclude it
        codes.curate_code_from_desc("総合計", "total")
        codes.blacklist_code("total")

        path = fileutils.getcache(filename, "jp", str(year))
        if filename.endswith("xls"):
            wb = xlrd.open_workbook(path)
            # each xls file starts with ToC listing tables A-E.
            # E1: direct energy consumption and energy intensity by sector
            # E2: direct CO2 emissions and CO2 intensity by sector
            for sheetname in ("E1", "E2"):
                sheet = wb.sheet_by_name(sheetname)
                min_series_col = 4  # first col whose values interest us
                if sheetname == "E1":
                    min_series_col = 3  # GDP - only want this once
                # NOTE(review): min_series_col is never used below — the
                # loops hard-code columns 3+; confirm whether E2's GDP
                # column was meant to be skipped
                series_names = series_names_from_rows(
                    sheet.row_values(0), sheet.row_values(1))
                for i in range(2, sheet.nrows):
                    row = sheet.row_values(i)
                    code = row[1]
                    if type(code) is float:
                        # numeric codes: normalize to zero-padded strings
                        code = str(int(code)).rjust(3, "0")
                    code = codes.set_code(code, row[2])
                    if code:
                        for (series, value) in zip(series_names, row[3:]):
                            if type(value) is float:
                                tables.insert_env(year, code, series, value)
        elif filename.endswith("xlsx"):
            wb = openpyxl.load_workbook(filename=path, use_iterators=True)
            # E: direct energy consumption and various GHG emissions by
            # sector, plus energy and GHG intensities
            sheet = wb.get_sheet_by_name("E")
            rows = sheet.iter_rows()
            # first two rows are the name row and the unit row
            series_names = series_names_from_rows(
                [cell.internal_value for cell in next(rows)],
                [cell.internal_value for cell in next(rows)])
            for row in rows:
                code = codes.set_code(row[1].internal_value,
                                      row[2].internal_value)
                if code:
                    for (series, cell) in zip(series_names, row[3:]):
                        if cell.internal_value is not None:
                            tables.insert_env(year, code, series,
                                              cell.internal_value)
        codes.update_codes()
def doparse():
    """Load population, GDP, and PPP figures into the world_supplement table.

    NOTE(review): this is a near-duplicate of the other world_supplement
    doparse() in this file (same sources, different dict formatting) —
    consider consolidating.

    Merges the Census IDB population file with two World Bank indicator
    spreadsheets for a fixed country set and inserts one
    (year, country, pop, gdp, ppp) row per country-year; gdp/ppp are None
    (SQL NULL) when missing.  Years without population are discarded.
    """
    # ppp rank from
    # https://www.cia.gov/library/publications/the-world-factbook/rankorder/2004rank.html
    # keys are ISO alpha-3; "fips" is the FIPS 10-4 code used by the Census
    # IDB file, "ppp" is the CIA factbook PPP rank (informational only)
    countries = {
        "LUX": {"fips": "LU", "ppp": 3},
        "USA": {"fips": "US", "ppp": 11},
        "NLD": {"fips": "NL", "ppp": 17},
        "AUT": {"fips": "AU", "ppp": 18},
        "SWE": {"fips": "SW", "ppp": 21},
        "CAN": {"fips": "CA", "ppp": 20},
        "AUS": {"fips": "AS", "ppp": 22},
        "IRL": {"fips": "EI", "ppp": 23},
        "DEU": {"fips": "GM", "ppp": 26},
        "TWN": {"fips": "TW", "ppp": 27},
        "BEL": {"fips": "BE", "ppp": 28},
        "DNK": {"fips": "DK", "ppp": 29},
        "FIN": {"fips": "FI", "ppp": 32},
        "GBR": {"fips": "UK", "ppp": 33},
        "FRA": {"fips": "FR", "ppp": 35},
        "JPN": {"fips": "JA", "ppp": 36},
        "KOR": {"fips": "KS", "ppp": 40},
        "ESP": {"fips": "SP", "ppp": 43},
        "ITA": {"fips": "IT", "ppp": 44},
        "CYP": {"fips": "CY", "ppp": 46},
        "SVN": {"fips": "SI", "ppp": 47},
        "CZE": {"fips": "EZ", "ppp": 50},  # EZ??
        "GRC": {"fips": "GR", "ppp": 52},
        "MLT": {"fips": "MT", "ppp": 53},
        "PRT": {"fips": "PO", "ppp": 57},
        "SVK": {"fips": "LO", "ppp": 58},
        "POL": {"fips": "PL", "ppp": 60},
        "EST": {"fips": "EN", "ppp": 61},
        "HUN": {"fips": "HU", "ppp": 63},
        "LTU": {"fips": "LH", "ppp": 65},
        "RUS": {"fips": "RS", "ppp": 71},
        "LVA": {"fips": "LG", "ppp": 75},
        "MEX": {"fips": "MX", "ppp": 85},
        "TUR": {"fips": "TU", "ppp": 86},
        "BRA": {"fips": "BR", "ppp": 92},
        "ROU": {"fips": "RO", "ppp": 97},
        "BGR": {"fips": "BU", "ppp": 101},
        "CHN": {"fips": "CH", "ppp": 121},
        "IDN": {"fips": "ID", "ppp": 156},
        "IND": {"fips": "IN", "ppp": 164},
        }

    tablename = "world_supplement"
    table = SQLTable(tablename,
                     ["year", "country", "pop", "gdp", "ppp"],
                     ["int", "char(3)", "int", "float", "float"]).create()
    table.truncate()

    # build FIPS -> ISO lookup and an empty per-country year map
    country_fips = {}
    data = {}
    for (country, info) in countries.items():
        data[country] = {}
        country_fips[info["fips"]] = country

    # this file spec is documented in the xlsx file from the archive
    # (pipe-delimited: fips|year|population)
    thisyear = datetime.datetime.now().year
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    if year >= thisyear:
                        # we don't want future projections
                        continue
                    country = country_fips[fips]
                    data[country][year] = {"pop": int(fields[2])}

    # World Bank indicator workbooks: row[1] is the ISO alpha-3 code,
    # columns from index 2 onward are years (per the header row)
    worldbank = {
        "ppp": "NY.GNP.PCAP.PP.CD_Indicator_MetaData_en_EXCEL.xls",
        "gdp": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    # this discards years where we don't have population
                    if year in data[country] and \
                            type(value) is float and value != 0:
                        data[country][year][indicator] = value

    # flush the merged structure to SQL; missing indicators become NULL
    for (country, country_data) in data.items():
        for (year, year_data) in country_data.items():
            ppp = None
            gdp = None
            pop = year_data["pop"]
            if "gdp" in year_data:
                gdp = year_data["gdp"]
            if "ppp" in year_data:
                ppp = year_data["ppp"]
            table.insert([year, country, pop, gdp, ppp])
def doparse():
    """Load world supplement data (pop/ppp/deflator series) for WIOD.

    Combines three cached sources into one long-format table of
    (year, country, measurement, value) rows:
      - Census IDB "IDBext001.txt" for population ("pop")
      - World Bank spreadsheets for PPP per capita and the GDP deflator
      - the IMF WEO bulk download for LP/PPPPC/NGDPRPC/NGDP_D series

    Fix vs. original: the regex literals are now raw strings — "\\d{4}"
    and "-*[\\d\\.,]+" contain escape sequences that are invalid in plain
    string literals (DeprecationWarning / SyntaxWarning on modern CPython).
    The compiled patterns are identical.
    """
    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename,
                     ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    # FIPS 10-4 -> ISO alpha-3 mapping for the Census IDB file
    country_fips = {
        "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT", "SW": "SWE",
        "CA": "CAN", "AS": "AUS", "EI": "IRL", "GM": "DEU", "BE": "BEL",
        "TW": "TWN", "DA": "DNK", "UK": "GBR", "FR": "FRA", "JA": "JPN",
        "KS": "KOR", "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE",
        "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK", "PL": "POL",
        "EN": "EST", "HU": "HUN", "LH": "LTU", "LG": "LVA", "MX": "MEX",
        "TU": "TUR", "BR": "BRA", "RO": "ROU", "BU": "BGR", "CH": "CHN",
        "ID": "IDN", "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA",
        }

    # this file spec is documented in the xlsx file from the archive
    # (pipe-delimited: fips|year|population)
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
        }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        # columns from index 2 onward are years per the header row
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    # IMF WEO subject codes we want
    imf_fields = (
        "LP",       # population
        "PPPPC",    # ppp per capita
        "NGDPRPC",  # gdp per capita in constant prices
        "NGDP_D",   # gdp deflator
        )

    # this is actually a csv file despite what it's called
    # (tab-delimited, cp1252-encoded)
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")
    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}
        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")
        # locate the columns we need from the header row.
        # NOTE(review): if any expected header is absent, the column
        # variables below are unbound and the row loop raises NameError —
        # assumed the WEO file format is stable.
        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i

        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    # values use thousands separators; strip before float()
                    if valid_float.match(value):  #and year < last_year:
                        table.insert([year, country, field,
                                      float(value.replace(",", ""))])