def parse_map():
    table = SQLTable("%s.code_map" % config.SCHEMA,
                     ["from_code", "to_code", "env_code",
                      "harmonized", "description"],
                     ["varchar(3)", "varchar(6)", "varchar(31)",
                      "char(3)", "text"]).create()
    table.truncate()

    filename = "code_map.xls"
    path = fileutils.getdatapath(filename, "uk")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_index(0)

    def sanitize_code(code):
        if type(code) is float:
            code = str(int(code))
        if not len(code):
            code = None
        return code

    for i in range(1, sheet.nrows):
        row = sheet.row_values(i)
        from_code = sanitize_code(row[0])
        to_code = sanitize_code(row[2])
        env_code = sanitize_code(row[4])
        harmonized = sanitize_code(row[6])
        desc = row[7].strip()
        table.insert([from_code, to_code, env_code, harmonized, desc])
def parse_codes():
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()
    for sheet in sheets:
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)
        if year > 2006:  # CxI only, need map between commods and inds
            colnames = ["io_sector", "env_sector", "harmonized_env",
                        "io_commod", "io_ind"]
            coltypes = ["varchar(255)"] * 5
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
            coltypes = ["varchar(255)"] * 3
        table = SQLTable(tablename, colnames, coltypes)
        table.create()
        table.truncate()

        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            io_sector = row[0].strip()
            env_sector = row[1].strip()
            harmonized_env = row[2]
            if type(harmonized_env) is float:
                harmonized_env = str(int(harmonized_env))
            harmonized_env = harmonized_env.strip()
            if year > 2006:
                io_commod = row[4].strip()
                io_ind = row[5].strip()
                table.insert([io_sector, env_sector, harmonized_env,
                              io_commod, io_ind])
            else:
                table.insert([io_sector, env_sector, harmonized_env])
def __init__(self):
    self.gdp_deflators = {}
    self.pce_deflators = {}
    table = SQLTable(TABLE_NAME,
                     ["year", "gdp", "pce"],
                     ["int", "float", "float"])
    result = table.getall()
    for row in result:
        year = row[0]
        self.gdp_deflators[year] = row[1]
        self.pce_deflators[year] = row[2]
class TableStateTracker:

    def __init__(self):
        self.xact = None
        self.table = None

    def drop_table(self, tablename, cascade=False):
        self.table = SQLTable(tablename)
        self.table.drop(cascade)

    def create_table(self, tablename, cols, coltypes, cascade=False):
        self.flush()
        self.table = SQLTable(tablename, cols, coltypes)
        self.table.drop(cascade)
        self.table.create()
        self.warmup()

    def insert_row(self, values):
        self.table.insert(values)
        #self.current_stmt(*values)

    def warmup(self):
        self.xact = db.xact(mode="READ WRITE")
        self.xact.begin()

    def flush(self):
        if self.xact is not None:
            self.xact.commit()
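# A minimal usage sketch for TableStateTracker, assuming SQLTable and db
# behave as the methods above imply. The table name and columns here are
# invented for illustration.
tracker = TableStateTracker()
tracker.create_table("demo.codes",  # drops, recreates, and opens a xact
                     ["code", "value"],
                     ["varchar(6)", "float"])
tracker.insert_row(["A01", 1.5])
tracker.insert_row(["A02", 2.0])
tracker.flush()  # commits the open READ WRITE transaction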
def activate():
    TradeResultsTable.__active = True
    TradeResultsTable.__sqltable = SQLTable(
        "trade_results",
        ["year", "country", "is_export", "industry", "value"],
        ["int", "char(3)", "bool", "varchar(15)", "float"])
    TradeResultsTable.__sqltable.create()
    TradeResultsTable.__sqltable.truncate()
def parse_codes():
    ## manually curated sector map
    table = SQLTable("%s.sector_map" % config.WIOD_SCHEMA,
                     ["io_code", "env_code", "description"],
                     ["varchar(15)", "varchar(15)", "text"]).create()
    table.truncate()

    sector_map = fileutils.getdatapath("sector_map.csv", "wiod")
    with open(sector_map, "r") as fh:  # close the handle when done
        csvf = csv.reader(fh)
        header = next(csvf)
        for row in csvf:
            io_code = row[0].strip()
            if not len(io_code):
                io_code = None
            env_code = row[1].strip()
            if not len(env_code):
                env_code = None
            desc = row[2].strip()
            table.insert([io_code, env_code, desc])

    ## current exchange rates
    table = SQLTable("%s.exchange_rates" % config.WIOD_SCHEMA,
                     ["country", "year", "rate"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    path = fileutils.getcache("exr_wiod.xls", "wiod")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_name("EXR")
    year_list = None
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        if len(row) < 2:
            continue
        if year_list is None:
            if type(row[0]) is str and row[0].strip() == "Country":
                year_list = [int(cell.strip("_ ")) for cell in row[2:]]
        else:
            if type(row[1]) is str and len(row[1].strip()) == 3:
                country = row[1]
                if country == "GER":
                    country = "DEU"
                for (year, value) in zip(year_list, row[2:]):
                    table.insert([country, year, value])
class CodeTracker:

    def __init__(self, name):
        self.name = name
        self.table = SQLTable(
            "%s.%s" % (config.WIOD_SCHEMA, name),
            ["code", "description"],
            ["varchar(15)", "varchar(255)"])
        self.code_dict = None

    def setup(self):
        self.table.create()
        self.get_codes()

    # get existing codes from db
    def get_codes(self):
        if self.code_dict is None:
            self.code_dict = {}
            for (code, desc) in self.table.getall():
                self.code_dict[code] = desc

    def get_desc_for_code(self, code):
        if code in self.code_dict:
            return self.code_dict[code]
        return None

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))
        if type(desc) is str:
            desc = desc.strip()

        if code is None or not len(code):
            if desc is None or not len(desc):
                # ignore empty args
                return False
            elif desc in config.fd_sectors:
                # choose manual codes
                code = config.fd_sectors[desc]
            elif desc in config.va_sectors:
                # choose manual codes
                code = config.va_sectors[desc]
            else:
                return False
        elif code in config.code_blacklist:
            # ignore invalid values for codes
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            print(self.code_dict[code], desc)
        self.code_dict[code] = desc
        return code

    def update_codes(self):
        self.table.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.table.insert([code, desc])
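# Sketch of the intended CodeTracker round trip, assuming config.fd_sectors,
# config.va_sectors, and config.code_blacklist exist as referenced above.
# The tracker name, code, and description are invented.
tracker = CodeTracker("industry_codes")
tracker.setup()                            # create table, load existing codes
code = tracker.set_code(15.0, " Mining ")  # float -> "15", desc is stripped
if code:                                   # set_code returns the code or False
    print(tracker.get_desc_for_code(code))  # -> "Mining"
tracker.update_codes()                     # rewrite table from in-memory dict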
def doparse():
    country_dict = dict((v, k) for k, v in config.countries.items())
    country_dict["Slovakia"] = "SVK"

    sources = ["total", "nuclear", "thermal", "renewable",
               "geothermal", "solar", "wind", "biomass"]
    measurements = ["capacity", "consumption"]

    tablename = "%s.world_power" % ("eia")
    table = SQLTable(
        tablename,
        ["year", "country", "source", "units", "value"],
        ["int", "char(3)", "varchar(15)", "varchar(4)", "float"])
    table.create()
    table.truncate()

    for source in sources:
        for measure in measurements:
            if measure == "consumption":
                if source in ("geothermal", "solar", "wind", "biomass"):
                    continue
                units = "bkWh"
            elif measure == "capacity":
                units = "MkW"

            filename = source + "_" + measure + ".xls"
            path = fileutils.getcache(filename, "eia")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_index(0)
            header = None
            for i in range(sheet.nrows):
                row = sheet.row_values(i)
                if header is None:
                    if len(row) > 2 and type(row[2]) is float:
                        header = []
                        for cell in row:
                            if type(cell) is float:
                                header.append(int(cell))
                            else:
                                header.append(None)
                        header_len = len(header)
                elif len(row) > 2:
                    country_name = row[0]
                    if country_name in country_dict:
                        country = country_dict[country_name]
                        # separate index so we don't shadow the row loop
                        for j in range(2, header_len):
                            value = row[j]
                            year = header[j]
                            if type(value) is float and value > 0:
                                table.insert(
                                    [year, country, source, units, value])
def add_io_table(self, year, sector_max_length=15):
    year = self.valid_year(year)
    if year not in self.io_tables:
        tablename = "%s.%s_%d" % (self.schema, self.io_prefix, year)
        colnames = ["from_sector", "to_sector", "value"]
        coltypes = ["varchar(%d)" % sector_max_length,
                    "varchar(%d)" % sector_max_length,
                    "float"]
        self.io_tables[year] = SQLTable(tablename, colnames, coltypes).create()
        self.io_tables[year].truncate()
def parse_env():
    tables = {}
    for year in config.STUDY_YEARS:
        tablename = "%s.env_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "measurement", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(31)", "float"]
        tables[year] = SQLTable(tablename, colnames, coltypes).create()
        tables[year].truncate()

    countries = sorted(config.countries.keys())
    countries.append("ROW")  # rest of world

    for (series, attribs) in config.env_series.items():
        if "dir" in attribs:
            subdir = attribs["dir"]
        else:
            subdir = series
        subdir = os.path.join("wiod", subdir)
        skip_name = "skip_name" in attribs and attribs["skip_name"]

        for country in config.countries.keys():
            filename = "%s_%s_May12.xls" % (country, series)
            print(filename)
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)
            for year in config.STUDY_YEARS:
                sheet = wb.sheet_by_name("%d" % year)
                measurements = sheet.row_values(0)
                if series == "EU":
                    measurements = [m + " - Gross" for m in measurements]
                elif series == "CO2":
                    measurements = ["CO2 - " + m for m in measurements]
                for i in range(1, sheet.nrows):
                    row = sheet.row_values(i)
                    if len(row[0].strip()):
                        if skip_name:
                            ind_code = row[0]
                            first_col = 1
                        else:
                            ind_name = row[0]
                            ind_code = row[1]
                            industry_tracker.set_code(ind_code, ind_name)
                            first_col = 2
                        for j in range(first_col, len(row)):
                            value = row[j]
                            if type(value) is float and value != 0:
                                measurement = measurements[j]
                                tables[year].insert(
                                    [country, ind_code, measurement, value])
def parse_codes():
    comcodes = parserutils.add_tracker("%s.com_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("commodities.csv", "ca")
    with open(filename, "r") as fh:
        csvf = csv.reader(fh)
        for row in csvf:
            if len(row) and regexes.is_num(row[0]):
                comcodes.set_code(row[0], row[1])
    comcodes.update_codes()

    maptable = SQLTable("%s.sector_map" % config.SCHEMA,
                        ["io_code", "env_code", "harmonized"],
                        ["varchar(15)", "varchar(15)", "varchar(15)"]).create()

    indcodes = parserutils.add_tracker("%s.ind_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("industries.csv", "ca")
    with open(filename, "r") as fh:
        csvf = csv.reader(fh)
        for row in csvf:
            if len(row) >= 5:
                io_code = row[0]
                if not len(io_code):
                    io_code = None
                elif len(row[1]):
                    indcodes.set_code(io_code, row[1])
                env_code = row[2]
                if not len(env_code):
                    env_code = None
                elif len(row[3]):
                    indcodes.set_code(env_code, row[3])
                harmonized = row[4]
                if len(harmonized) and regexes.is_num(harmonized):
                    indcodes.set_code(harmonized, row[5])
                    maptable.insert([io_code, env_code, harmonized])
    indcodes.update_codes()
def parse_sut(sheet_name, table_prefix):
    tables = {}
    colnames = ["country", "commodity", "industry", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s_%d" % (table_prefix, year)
        tables[year] = SQLTable(tablename, colnames, coltypes).create()
        tables[year].truncate()

    for country in config.countries.keys():
        # TODO: more automated way to get this
        if country in ("AUS", "DEU", "GBR", "USA"):
            filename = "%s_SUT_Feb12.xls" % country
        else:
            filename = "%s_SUT_Jan12.xls" % country
        subdir = os.path.join("wiod", "suts")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)

        # extract supply and use tables at fob prices
        sheet = wb.sheet_by_name(sheet_name)
        industry_row = sheet.row_values(0)
        row = sheet.row_values(1)
        industry_codes = []
        for (code, desc) in zip(industry_row, row):
            industry_codes.append(industry_tracker.set_code(code, desc))

        for i in range(2, sheet.nrows):
            row = sheet.row_values(i)
            if not len(row[0].strip()):
                continue
            year = int(row[0])
            if year not in config.STUDY_YEARS:
                continue
            com_code = commodity_tracker.set_code(row[1], row[2])
            if not com_code:
                continue
            for j in range(3, len(row)):
                value = row[j]
                ind_code = industry_codes[j]
                if value != 0 and ind_code:
                    # commodity first
                    tables[year].insert(
                        [country, com_code, ind_code, value])
def doparse(): table = SQLTable("%s.mdg_emissions" % config.UN_SCHEMA, ["country", "year", "value"], ["char(3)", "int", "float"]).create() table.truncate() country_dict = dict((v, k) for k, v in config.countries.items()) country_dict["Slovakia"] = "SVK" country_dict["Russian Federation"] = "RUS" year_pat = re.compile("[12]\d{3}") path = fileutils.getdatapath("mdg_emissions.csv", "un") with open(path, "r") as fh: csvf = csv.reader(fh) header = next(csvf) header_index = {} years = [] for i in range(len(header)): header_index[header[i]] = i if year_pat.match(header[i]): years.append(header[i]) for row in csvf: if len(row) <= header_index["SeriesCode"] or \ row[header_index["SeriesCode"]] != "749": continue country_name = row[header_index["Country"]] if country_name not in country_dict: continue country = country_dict[country_name] for year in years: value = row[header_index[year]].strip() if len(value): table.insert([country, int(year), float(value)])
def doparse():
    # ppp rank from
    # https://www.cia.gov/library/publications/the-world-factbook/rankorder/2004rank.html
    countries = {
        "LUX": {"fips": "LU", "ppp": 3},
        "USA": {"fips": "US", "ppp": 11},
        "NLD": {"fips": "NL", "ppp": 17},
        "AUT": {"fips": "AU", "ppp": 18},
        "SWE": {"fips": "SW", "ppp": 21},
        "CAN": {"fips": "CA", "ppp": 20},
        "AUS": {"fips": "AS", "ppp": 22},
        "IRL": {"fips": "EI", "ppp": 23},
        "DEU": {"fips": "GM", "ppp": 26},
        "TWN": {"fips": "TW", "ppp": 27},
        "BEL": {"fips": "BE", "ppp": 28},
        "DNK": {"fips": "DK", "ppp": 29},
        "FIN": {"fips": "FI", "ppp": 32},
        "GBR": {"fips": "UK", "ppp": 33},
        "FRA": {"fips": "FR", "ppp": 35},
        "JPN": {"fips": "JA", "ppp": 36},
        "KOR": {"fips": "KS", "ppp": 40},
        "ESP": {"fips": "SP", "ppp": 43},
        "ITA": {"fips": "IT", "ppp": 44},
        "CYP": {"fips": "CY", "ppp": 46},
        "SVN": {"fips": "SI", "ppp": 47},
        "CZE": {"fips": "EZ", "ppp": 50},  # EZ??
        "GRC": {"fips": "GR", "ppp": 52},
        "MLT": {"fips": "MT", "ppp": 53},
        "PRT": {"fips": "PO", "ppp": 57},
        "SVK": {"fips": "LO", "ppp": 58},
        "POL": {"fips": "PL", "ppp": 60},
        "EST": {"fips": "EN", "ppp": 61},
        "HUN": {"fips": "HU", "ppp": 63},
        "LTU": {"fips": "LH", "ppp": 65},
        "RUS": {"fips": "RS", "ppp": 71},
        "LVA": {"fips": "LG", "ppp": 75},
        "MEX": {"fips": "MX", "ppp": 85},
        "TUR": {"fips": "TU", "ppp": 86},
        "BRA": {"fips": "BR", "ppp": 92},
        "ROU": {"fips": "RO", "ppp": 97},
        "BGR": {"fips": "BU", "ppp": 101},
        "CHN": {"fips": "CH", "ppp": 121},
        "IDN": {"fips": "ID", "ppp": 156},
        "IND": {"fips": "IN", "ppp": 164},
    }

    tablename = "world_supplement"
    table = SQLTable(tablename,
                     ["year", "country", "pop", "gdp", "ppp"],
                     ["int", "char(3)", "int", "float", "float"]).create()
    table.truncate()

    country_fips = {}
    data = {}
    for (country, info) in countries.items():
        data[country] = {}
        country_fips[info["fips"]] = country

    # this file spec is documented in the xlsx file from the archive
    thisyear = datetime.datetime.now().year
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    if year >= thisyear:
                        # we don't want future projections
                        continue
                    country = country_fips[fips]
                    data[country][year] = {"pop": int(fields[2])}

    worldbank = {
        "ppp": "NY.GNP.PCAP.PP.CD_Indicator_MetaData_en_EXCEL.xls",
        "gdp": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
    }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    # this discards years where we don't have population
                    if year in data[country] and \
                            type(value) is float and value != 0:
                        data[country][year][indicator] = value

    for (country, country_data) in data.items():
        for (year, year_data) in country_data.items():
            ppp = None
            gdp = None
            pop = year_data["pop"]
            if "gdp" in year_data:
                gdp = year_data["gdp"]
            if "ppp" in year_data:
                ppp = year_data["ppp"]
            table.insert([year, country, pop, gdp, ppp])
def parse_codes():
    # parse sector maps
    path = fileutils.getdatapath("io_env_map.xls", "jp")
    wb = xlrd.open_workbook(path)

    io_tables = {}
    env_tables = {}
    harmonized_sectors = {}

    harmonized_table = SQLTable("%s.harmonized_codes" % config.SCHEMA,
                                ["code", "description"],
                                ["char(3)", "varchar(63)"]).create()

    for year in config.STUDY_YEARS:
        # all io codes are in one sheet, parse afterward
        io_tables[year] = SQLTable(
            "%s.io_map_%d" % (config.SCHEMA, year),
            ["io_sector", "description", "harmonized"],
            ["char(3)", "varchar(63)", "char(3)"]).create()
        io_tables[year].truncate()

        # parse env codes
        env_table = SQLTable(
            "%s.env_map_%d" % (config.SCHEMA, year),
            ["env_sector", "description", "harmonized"],
            ["varchar(7)", "varchar(63)", "char(3)"]).create()
        env_table.truncate()

        sheet = wb.sheet_by_name(str(year))
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            code = row[0]
            if type(code) is float:
                # 2005 codes are 5 or more digits so this just trims .0
                code = str(int(code)).rjust(3, "0")
            desc = row[1]
            h_code = row[2]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")
            env_table.insert([code, desc, h_code])

            if h_code not in harmonized_sectors:
                h_desc = row[3]
                harmonized_sectors[h_code] = 1
                harmonized_table.insert([h_code, h_desc])

    sheet = wb.sheet_by_name("io")
    positions = {}
    header = sheet.row_values(0)
    for i in range(len(header)):
        if type(header[i]) is float:
            positions[int(header[i])] = i
        elif header[i] == "harmonized":
            positions["harmonized"] = i

    for i in range(1, sheet.nrows):
        row = sheet.row_values(i)
        for year in config.STUDY_YEARS:
            code = row[positions[year]]
            if type(code) is float:
                code = str(int(code)).rjust(3, "0")
            if code is None or not len(code):
                continue
            desc = row[positions[year] + 1]
            h_code = row[positions["harmonized"]]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")
            io_tables[year].insert([code, desc, h_code])
def parse_io():
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
    }

    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)
        # millions are in NTD
        table = SQLTable(tablename,
                         ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        to_codes = sheet.row_values(0)
        to_names = sheet.row_values(1)
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_code = row[0].strip()
            from_name = row[1].strip()
            for i in range(2, len(to_names)):
                to_name = to_names[i].strip()
                value = row[i]
                table.insert([from_name, to_name, value])

        if year == 2010:
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist": sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                    sqlhelper.set_repr(config.from_blacklists[year]),
            }
            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
                SELECT from_map.io_sector AS from_sector,
                       to_map.io_sector AS to_sector,
                       sum(millions) AS millions
                  FROM %(tablename)s io,
                       (SELECT DISTINCT io_sector, io_commod
                          FROM %(maptable)s) from_map,
                       (SELECT DISTINCT io_sector, io_ind
                          FROM %(maptable)s) to_map
                 WHERE io.to_sector NOT IN %(to_blacklist)s
                   AND io.from_sector NOT IN %(from_blacklist)s
                   AND from_map.io_commod = io.from_sector
                   AND to_map.io_ind = io.to_sector
                 GROUP BY from_map.io_sector, to_map.io_sector""" % strings
            print(sql)
            db.execute(sql)
def doparse():
    for year in (1972, 1977):
        table = SQLTable("%s.codes_%d" % (config.IO_SCHEMA, year),
                         ["code", "description"],
                         ["char(6)", "text"]).create()
        table.truncate()
        filepath = fileutils.getdatapath("io_sectors_%d.csv" % year, "usa")
        with open(filepath, "r") as fh:
            csvf = csv.reader(fh)
            for row in csvf:
                if len(row) and len(row[0]):
                    table.insert([row[0], row[1]])
        if year == 1972:
            # this is stated in the rtf file for both 1972 and 1977
            # but this code never appears in 1977, the documentation
            # was probably not properly updated
            table.insert(["870000", "total value added"])

    writer = dbsetup.IOCodeTableWriter()

    writer.set_year(1982, "Io-code.doc")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 8:
                code = line[:6]
                desc = line[8:]
                writer.writerow(code, desc)

    writer.set_year(1987, "SIC-IO.DOC")
    with open(writer.get_filename()) as f:
        pattern = re.compile(r'\s*(\d{1,2})\.(\d{4})\s+([^0-9\*]+)')
        for line in f:
            match = pattern.match(line)
            if match:
                code = match.group(1).rjust(2, '0') + match.group(2)
                desc = match.group(3).strip('(. \r\n')
                writer.writerow(code, desc)

    writer.set_year(1992, "io-code.txt")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 7:
                code = line[:6]
                desc = line[7:]
                writer.writerow(code, desc)

    writer.set_year(1997, "IO-CodeDetail.txt")
    with open(writer.get_filename()) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 2:
                writer.writerow(row[0], row[1])

    writer.set_year(2002, "REV_NAICSUseDetail 4-24-08.txt")
    with open(writer.get_filename()) as f:
        valid_line = re.compile(r"[A-Z0-9]{6}\s")
        line = f.readline().strip().replace("GasPipeVal", "GasPipe ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        codemap = {}
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(
                    dbsetup.replace_tabs(line), fields)
                codemap[row["Commodity"]] = row["CommodityDescription"]
                codemap[row["Industry"]] = row["IndustryDescription"]
        for (code, desc) in codemap.items():
            writer.writerow(code, desc)

    writer.flush()
class IOTableStateTracker(TableStateTracker):

    def __init__(self):
        TableStateTracker.__init__(self)
        self.make_table = None
        self.use_table = None
        self.make_insert_count = 0
        self.use_insert_count = 0

    def flush(self):
        TableStateTracker.flush(self)
        if self.make_insert_count:
            print("%d rows inserted to make table" % self.make_insert_count)
            self.make_insert_count = 0
        if self.use_insert_count:
            print("%d rows inserted to use table" % self.use_insert_count)
            self.use_insert_count = 0

    def create_make_table(self, year):
        print("creating make table for %s..." % year)
        tablename = "%s.make_%s" % (config.IO_SCHEMA, year)
        self.make_table = SQLTable(tablename,
                                   ["industry", "commodity", "thousands"],
                                   ["varchar(6)", "varchar(6)", "bigint"])
        self.make_table.create()
        self.make_table.truncate()

    def create_use_table(self, year, has_margins=False):
        print("creating use table for %s..." % year)
        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")
        tablename = "%s.use_%s" % (config.IO_SCHEMA, year)
        self.use_table = SQLTable(tablename, cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()

    def insert_make(self, indus, commod, makeval, factor=1):
        value = float(makeval) * factor
        if (value != 0):
            self.make_table.insert([indus.strip(), commod.strip(), int(value)])
            self.make_insert_count += 1

    def insert_use(self, commod, indus, useval, margins={}, factor=1):
        useval = float(useval) * factor
        nonzero = useval
        values = [commod.strip(), indus.strip(), int(useval)]
        if len(margins) > 0:
            for margin_field in bea.use_table_margins:
                value = 0
                if margin_field in margins:
                    value = float(margins[margin_field]) * factor
                if value:
                    nonzero += value
                values.append(value)
        if nonzero != 0:
            self.use_table.insert(values)
            self.use_insert_count += 1

    # this is for years with no distinction between
    # make and use tables
    def create_simple_transaction_table(self, year, filename, factor=1):
        print("creating transactions table for %s..." % year)
        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename,
                          ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()
        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1
        print("%d rows inserted" % insert_count)

    # this is for years that have make and use but no margins
    def create_simple_make_use(self, year, filename, factor=1):
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) == 4:
                    input_ind = cols[0]     # comm consumed (producing ind)
                    output_ind = cols[1]    # consuming ind (comm produced)
                    use_dollars = cols[2]   # use in producers' prices
                    make_dollars = cols[3]  # make in producers' prices
                    self.insert_make(input_ind, output_ind,
                                     make_dollars, factor)
                    self.insert_use(commod=input_ind, indus=output_ind,
                                    useval=use_dollars, factor=factor)
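# Hypothetical driver for the tracker above; the year, filename, and factor
# are invented. create_simple_make_use() expects a whitespace-delimited cache
# file with one "input output use make" row per transaction.
tracker = IOTableStateTracker()
tracker.create_simple_make_use(1967, "io1967.txt", factor=1000)
tracker.flush()  # commit and print make/use insert counts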
class SectorCodes:

    def __init__(self, codetablename, mode="r"):
        self.mode = mode
        self.codetable = SQLTable(codetablename,
                                  ["code", "description"],
                                  ["varchar(15)", "varchar(255)"])
        self.code_dict = {}
        self.reverse_code_dict = {}
        self.setup()

    def setup(self):
        if self.mode == "w":
            # invalid codes or codes that we don't want to record
            self.code_blacklist = []
            # if we want to override the code provided with something
            # we make up (or from another set) based on the description
            self.manual_codes = {}
            self.codetable.create()
        # get existing codes from db
        for (code, desc) in self.codetable.getall():
            self.code_dict[code] = desc
            self.reverse_code_dict[desc] = code
        return self

    # for write mode
    def blacklist_code(self, code):
        self.code_blacklist.append(code)
        if code in self.code_dict:
            del self.code_dict[code]

    def set_blacklist(self, code_blacklist):
        self.code_blacklist = []
        for code in code_blacklist:
            self.blacklist_code(code)

    def curate_code_from_desc(self, desc, code):
        self.manual_codes[desc] = code
        self.code_dict[code] = desc
        self.reverse_code_dict[desc] = code

    def add_curated_codes(self, curated_codes):
        for (desc, code) in curated_codes.items():
            self.curate_code_from_desc(desc, code)

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))
        if type(desc) is str:
            desc = desc.strip()

        if desc in self.manual_codes:
            code = self.manual_codes[desc]

        if code is None or not len(code):
            if desc is None or not len(desc):
                # ignore empty args
                return False
            else:
                return False
        elif code in self.code_blacklist:
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            # this is to check for blatant differences
            print(self.code_dict[code], "=>", desc)

        self.code_dict[code] = desc
        # there may be more than one description for the same code
        self.reverse_code_dict[desc] = code
        return code

    def has_code(self, code):
        return code in self.code_dict

    def get_code_for_title(self, desc):
        if desc in self.reverse_code_dict:
            return self.reverse_code_dict[desc]

    def get_title_for_code(self, code):
        if self.has_code(code):
            return self.code_dict[code]
        return False

    def update_codes(self):
        if self.mode != "w":
            raise Exception("SectorCodes created in read-only mode")
        self.codetable.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.codetable.insert([code, desc])
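# Sketch of the write-mode workflow for SectorCodes; the table name, codes,
# and descriptions are invented.
codes = SectorCodes("demo.sector_codes", mode="w")
codes.set_blacklist(["999"])              # codes we never want recorded
codes.curate_code_from_desc("Households", "HH")
codes.set_code("101", "Crop production")  # -> "101"
codes.set_code(999.0, "Unclassified")     # -> False (blacklisted)
codes.set_code(None, "Households")        # -> "HH" via the curated mapping
codes.update_codes()                      # persist sorted codes to the table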
def create_table(self, tablename, cols, coltypes, cascade=False):
    table = SQLTable(tablename, cols, coltypes)
    table.drop(cascade)
    table.create()
    self.tables[tablename] = table
def doparse(): tablename = "%s.world_supplement" % config.WIOD_SCHEMA table = SQLTable(tablename, ["year", "country", "measurement", "value"], ["int", "char(3)", "varchar(8)", "float"]) table.create() table.truncate() # census data has more complete population counts country_fips = { "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT", "SW": "SWE", "CA": "CAN", "AS": "AUS", "EI": "IRL", "GM": "DEU", "BE": "BEL", "TW": "TWN", "DA": "DNK", "UK": "GBR", "FR": "FRA", "JA": "JPN", "KS": "KOR", "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE", "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK", "PL": "POL", "EN": "EST", "HU": "HUN", "LH": "LTU", "LG": "LVA", "MX": "MEX", "TU": "TUR", "BR": "BRA", "RO": "ROU", "BU": "BGR", "CH": "CHN", "ID": "IDN", "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA", } # this file spec is documented in the xlsx file from the archive path = fileutils.getcache("IDBext001.txt", "wsupp") with open(path, "r") as fh: for line in fh: fields = line.split("|") if len(fields) == 3: fips = fields[0] if fips in country_fips: year = int(fields[1]) country = country_fips[fips] table.insert([year, country, "pop", int(fields[2])]) # worldbank data has some deflator data that imf doesn't worldbank = { "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls", #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls", #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls", #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls", "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls", } for (indicator, filename) in worldbank.items(): path = fileutils.getcache(filename, "wsupp") wb = xlrd.open_workbook(path) sheet = wb.sheet_by_index(0) header = [int(x) for x in sheet.row_values(0)[2:]] for i in range(1, sheet.nrows): row = sheet.row_values(i) if row[1] in config.countries: country = row[1] for (year, value) in zip(header, row[2:]): if type(value) is float and value != 0: table.insert([year, country, indicator, value]) imf_fields = ( "LP", # population "PPPPC", # ppp per capita "NGDPRPC", # gdp per capita in constant prices "NGDP_D", # gdp deflator ) # this is actually a csv file despite what it's called path = fileutils.getcache("WEOApr2012all.xls", "wsupp") with codecs.open(path, "r", "cp1252") as fh: csvf = csv.reader(fh, dialect=csv.excel_tab) header = next(csvf) year_cols = {} valid_year = re.compile("\d{4}") valid_float = re.compile("-*[\d\.,]+") for i in range(len(header)): if header[i] == "ISO": country_col = i elif header[i] == "WEO Subject Code": subject_col = i elif valid_year.match(header[i]): year_cols[int(header[i])] = i elif header[i] == "Estimates Start After": last_year_col = i for row in csvf: if len(row) > subject_col and row[subject_col] in imf_fields: field = row[subject_col] country = row[country_col] if country not in config.countries: continue if valid_year.match(row[last_year_col]): last_year = int(row[last_year_col]) else: # not clear if this means all values are estimated last_year = 9999 for (year, colnum) in year_cols.items(): value = row[colnum] if valid_float.match(value): #and year < last_year: table.insert([ year, country, field, float(value.replace(",", "")) ])
def parse_env():
    cache_dirs = fileutils.getcachecontents("cn")
    for adir in cache_dirs:
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
            for (pollutant, amount) in zip(columns[max_sector_column+1:],
                                           rowdata[max_sector_column+1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
            print(adir, filename)
            title = soup.title.string

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise
            prev_rowdata = None
            prev_rowspans = None
            data = []

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates keeping state using heuristics.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])
                    #if not has_numbers and not did_have_numbers: # near BOF

                if insert_now is not None and columns is None:
                    columns = insert_now
                    insert_now = None
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
                elif insert_now is not None and len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)
        xact.commit()
def parse_int():
    for year in config.STUDY_YEARS:
        tablename = "%s.int_use_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["from_country", "to_country",
                    "commodity", "industry", "value"]
        coltypes = ["char(3)", "char(3)",
                    "varchar(15)", "varchar(15)", "float"]
        use_table = SQLTable(tablename, colnames, coltypes).create()

        tablename = "%s.int_make_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "commodity", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        make_table = SQLTable(tablename, colnames, coltypes).create()

        filename = "IntSUT%s_row_Apr12.xls" % str(year)[2:4]
        subdir = os.path.join("wiod", "intsuts_analytic")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)

        for country in config.countries.keys():
            sheet = wb.sheet_by_name("USE_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                # notes say Use tables are broken down by origin
                from_country = row[1]
                # stupid hack so i don't have to change char(3)
                if from_country == "ZROW":
                    from_country = "RoW"
                com_code = commodity_tracker.set_code(row[2], row[3])
                if not com_code:
                    continue
                for j in range(4, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        use_table.insert(
                            [from_country, country, com_code, ind_code, value])

            sheet = wb.sheet_by_name("SUP_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # industry first
                        make_table.insert([country, ind_code, com_code, value])
def parse_env():
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001
    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste", "Hazardous waste",
                        "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }

    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]

        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or
                         row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:
                        # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                        first_file = False
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i + 5]])

    # parse chinese env tables
    # this is a file that we created by compiling older chinese data and
    # manually copying info from latest (2010) pdf files
    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }

    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)
            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    # separate index so we don't shadow the row loop
                    for j in range(1, len(header)):
                        measurement = header[j].strip()
                        value = row[j]
                        table.insert([sector, measurement, value])
                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True
def doparse():
    carrier_countries = {
        #"-": "",  # Unknown
        "1I": "USA",  # Sky Trek International Airlines
        "2T": "CAN",  # Canada 3000 Airlines Ltd.
        "3Z": "USA",  # Tatonduk Outfitters Limited d/b/a Everts Air Alaska and Everts Air Cargo
        "5X": "USA",  # United Parcel Service
        "5Y": "USA",  # Atlas Air Inc.
        "6F": "GBR",  # Laker Airways Inc.
        #"6U": "",  # Air Ukraine
        #"6Y": "",  # Nicaraguense De Aviacion Sa
        #"7P": "",  # Apa International Air S.A. (dominican rep)
        #"7Z": "",  # Lb Limited
        "8C": "USA",  # Air Transport International
        "AA": "USA",  # American Airlines Inc.
        "AC": "CAN",  # Air Canada
        #"ADB": "",  # Antonov Company (ukraine)
        "AF": "FRA",  # Compagnie Nat'l Air France
        "AI": "IND",  # National Aviation Company of India Limited d/b/a Air India
        "AM": "MEX",  # Aeromexico
        #"AQQ": "",  # Air Charter (Safa)
        #"AR": "",  # Aerolineas Argentinas
        "AS": "USA",  # Alaska Airlines Inc.
        #"AT": "",  # Royal Air Maroc (morocco)
        #"AV": "",  # Aerovias Nac'l De Colombia
        "AY": "FIN",  # Finnair Oy
        "AZ": "ITA",  # Compagnia Aerea Italiana
        #"All Rows": "",  # All Rows (including those not displayed)
        "BA": "GBR",  # British Airways Plc
        #"BBQ": "",  # Balair Ag (swiss)
        "BCQ": "CAN",  # Bradley Air Services Ltd.
        #"BG": "",  # Biman Bangladesh Airlines
        "BQ": "MEX",  # Aeromar C. Por A.
        "BR": "TWN",  # Eva Airways Corporation
        #"BW": "",  # Caribbean Airlines Limited (trinidad and tobago)
        "BY": "GBR",  # Britannia Airways Ltd.
        "CA": "CHN",  # Air China
        #"CC": "",  # Air Atlanta Icelandic
        "CDQ": "USA",  # Kitty Hawk International
        #"CF": "",  # Compan. De Aviacion Faucett (peru)
        "CI": "TWN",  # China Airlines Ltd.
        #"CLQ": "",  # Aero Transcolombiana
        #"CM": "",  # Compania Panamena (Copa)
        "CO": "USA",  # Continental Air Lines Inc.
        "CP (1)": "CAN",  # Canadian Airlines International Ltd.
        "CS": "USA",  # Continental Micronesia
        "CV": "LUX",  # Cargolux Airlines International S.A
        #"CVQ": "",  # Caraven S.A.
        #"CX": "",  # Cathay Pacific Airways Ltd. (hong kong, includes pre 1997)
        "CYQ": "FRA",  # Corse Air International (assuming corsair)
        "CZ": "CHN",  # China Southern Airlines
        "DE": "DEU",  # Condor Flugdienst
        "DHQ": "GBR",  # DHL Aero Expresso
        "DL": "USA",  # Delta Air Lines Inc.
        #"ED": "",  # Andes (ecuador or argentina)
        "EH": "ESP",  # Saeta Airlines
        "EI": "IRL",  # Aer Lingus Plc
        #"EOQ": "",  # Aeroservicios Ecuatorianos
        "ER": "USA",  # Astar USA, LLC
        #"EU": "",  # Ecuatoriana De Aviacion
        #"EXQ": "",  # Export Air Del Peru S.A.
        "EZ": "TWN",  # Evergreen International Inc.
        "F9": "USA",  # Frontier Airlines Inc.
        "FCQ": "USA",  # Falcon Air Express
        #"FF": "",  # Tower Air Inc.
        #"FI": "",  # Icelandair
        #"FJ": "",  # Air Pacific Ltd. (fiji)
        "FNQ": "USA",  # Fine Airlines Inc.
        #"FQ": "",  # Air Aruba
        #"FS": "",  # Serv De Trans Aereos Fuegui (argentina)
        "FX": "USA",  # Federal Express Corporation
        #"G3": "",  # Aerochago S.A.
        "GA": "IDN",  # P.T. Garuda Indonesian Arwy
        "GD": "MEX",  # Transp. Aereos Ejecutivos
        #"GF": "",  # Gulf Air Company (bahrain)
        #"GH": "",  # Ghana Airways Corporation
        "GJ (1)": "MEX",  # Mexicargo
        "GL": "USA",  # Miami Air International
        "GR": "USA",  # Gemini Air Cargo Airways
        #"GU": "",  # Aviateca (guatemala)
        #"GY": "",  # Guyana Airways Corporation
        "H2": "BEL",  # City Bird
        "H5": "RUS",  # Magadan Airlines
        "HA": "USA",  # Hawaiian Airlines Inc.
        "HAQ": "DEU",  # Hapag Lloyd Flug.
        "HCQ": "USA",  # Av Atlantic
        #"HFQ": "",  # Haiti Air Freight Intl
        "HLQ": "AUS",  # Heavylift Cargo Airlines Lt
        "HP": "USA",  # America West Airlines Inc. (Merged with US Airways 9/05. Stopped reporting 10/07.)
        #"HY": "",  # Uzbekistan Airways
        "IB": "ESP",  # Iberia Air Lines Of Spain
        #"ITQ": "",  # Interamericana De Aviacion (uruguay)
        "IW": "FRA",  # Air Liberte Aka Aom Minerve
        #"JAQ": "",  # Jamaica Air Freighters
        "JD": "JPN",  # Japan Air System Co. Ltd.
        "JI (1)": "USA",  # Midway Airlines Inc.
        "JK": "ESP",  # Spanair S.A.
        "JKQ": "USA",  # Express One International Inc.
        "JL": "JPN",  # Japan Air Lines Co. Ltd.
        #"JM": "",  # Air Jamaica Limited
        "JR": "USA",  # Aero California
        "JW": "CAN",  # Arrow Air Inc.
        "JZ": "JPN",  # Japan Air Charter Co. Ltd.
        "K8 (1)": "NLD",  # Dutch Caribbean Airlines
        "KE": "KOR",  # Korean Air Lines Co. Ltd.
        "KH": "USA",  # Aloha Air Cargo
        #"KI": "",  # Time Air Ltd. (south africa)
        "KL": "NLD",  # Klm Royal Dutch Airlines
        #"KP": "",  # Kiwi International
        "KR": "USA",  # Kitty Hawk Aircargo
        "KTQ": "TUR",  # Turks Air Ltd.
        #"KU": "",  # Kuwait Airways Corp.
        "KW": "USA",  # Carnival Air Lines Inc.
        #"KX": "",  # Cayman Airways Limited
        "KZ": "JPN",  # Nippon Cargo Airlines
        #"LA": "",  # Lan-Chile Airlines
        #"LB": "",  # Lloyd Aereo Boliviano S. A.
        "LGQ": "MEX",  # Lineas Aereas Allegro
        "LH": "DEU",  # Lufthansa German Airlines
        "LO": "POL",  # Polskie Linie Lotnicze
        #"LR": "",  # Lacsa (costa rica)
        #"LSQ": "",  # Lineas Aereas Suramerican (colombia)
        "LT": "DEU",  # Luftransport-Unternehmen
        #"LU": "",  # Air Atlantic Dominicana
        #"LY": "",  # El Al Israel Airlines Ltd.
        "LZ": "BGR",  # Balkan Bulgarian Airlines
        "M6": "USA",  # Amerijet International
        "M7": "MEX",  # Aerotransportes Mas De Crga
        "MA": "HUN",  # Malev Hungarian Airlines
        "MG": "USA",  # Champion Air
        #"MH": "",  # Malaysian Airline System
        #"ML": "",  # Aero Costa Rica
        "MP": "NLD",  # Martinair Holland N.V.
        #"MS": "",  # Egyptair
        "MT": "GBR",  # Thomas Cook Airlines Uk Ltd.
        "MT (1)": "GBR",  # Flying Colours Airlines Ltd.
        "MU": "CHN",  # China Eastern Airlines
        #"MUQ": "",  # Aerolineas Mundo (columbia)
        "MX": "MEX",  # Compania Mexicana De Aviaci
        #"MYQ": "",  # Lineas Aereas Mayas (Lamsa)
        #"N5 (1)": "",  # Nations Air Express Inc.
        "NA": "USA",  # North American Airlines
        "NG": "DEU",  # Lauda Air Luftfahrt Ag
        "NH": "JPN",  # All Nippon Airways Co.
        "NK": "USA",  # Spirit Air Lines
        "NW": "USA",  # Northwest Airlines Inc.
        "NWQ": "USA",  # N. W. Territorial Airways
        #"NZ": "",  # Air New Zealand
        "OA": "GRC",  # Olympic Airways
        #"OI": "",  # Prestige Airways (uae)
        "OK": "CZE",  # Czech Airlines
        #"ON": "",  # Air Nauru
        "OS": "AUT",  # Austrian Airlines
        "OW": "USA",  # Executive Airlines
        "OZ": "KOR",  # Asiana Airlines Inc.
        "PA (2)": "USA",  # Pan American World Airways
        "PCQ": "USA",  # Pace Airlines
        #"PIQ": "",  # Pacific International Airlines (ambiguous: usa, panama)
        #"PK": "",  # Pakistan International Airlines
        #"PL": "",  # Aero Peru
        "PNQ": "USA",  # Panagra Airways
        "PO": "USA",  # Polar Air Cargo Airways
        #"PR": "",  # Philippine Airlines Inc.
        "PRQ": "USA",  # Florida West Airlines Inc.
        "PT": "USA",  # Capital Cargo International
        #"PY": "",  # Surinam Airways Limited
        "Q7": "BEL",  # Sobelair
        "QF": "AUS",  # Qantas Airways Ltd.
        "QK": "CAN",  # Jazz Aviation LP
        #"QN": "",  # Royal Air (ambiguous)
        "QO": "MEX",  # Aeromexpress
        "QQ": "USA",  # Reno Air Inc.
        #"QT": "",  # Transportes Aereos Mercantiles Panamericanos S.A (colombia)
        "QTQ": "IRL",  # Aer Turas Teoranta
        "QX": "USA",  # Horizon Air
        "RD": "USA",  # Ryan International Airlines
        "REQ": "USA",  # Renown Aviation
        "RG": "BRA",  # Varig S. A.
        #"RJ": "",  # Alia-(The) Royal Jordanian
        #"RK": "",  # Air Afrique
        "RNQ": "GBR",  # Mytravel Airways
        "RO": "ROU",  # Tarom Romanian Air Transpor
        #"SA": "",  # South African Airways
        "SAQ": "USA",  # Southern Air Transport Inc.
        "SEQ": "GBR",  # Sky Service F.B.O.
        "SIQ": "LUX",  # Premiair
        "SK": "SWE",  # Scandinavian Airlines Sys.
        "SM": "USA",  # Sunworld International Airlines
        "SN (1)": "BEL",  # Sabena Belgian World Air.
        "SPQ": "USA",  # Sun Pacific International
        #"SQ": "",  # Singapore Airlines Ltd.
        #"SR": "",  # Swissair Transport Co. Ltd.
        "SU": "RUS",  # Aeroflot Russian Airlines
        #"SV": "",  # Saudi Arabian Airlines Corp
        "SX (1)": "MEX",  # Aeroejecutivo S.A.
        "SY": "USA",  # Sun Country Airlines d/b/a MN Airlines
        "T9": "USA",  # TransMeridian Airlines
        #"TA": "",  # Taca International Airlines (el savador)
        "TCQ": "USA",  # Express.Net Airlines
        #"TG": "",  # Thai Airways International Ltd.
        "TK": "TUR",  # Turk Hava Yollari A.O.
        "TKQ": "USA",  # Trans-Air-Link Corporation
        "TNQ": "USA",  # Emery Worldwide Airlines
        "TP": "PRT",  # Tap-Portuguese Airlines
        "TR": "BRA",  # Transbrasil S.A.
        "TRQ": "SWE",  # Blue Scandinavia Ab
        "TS": "CAN",  # Air Transat
        "TW": "USA",  # Trans World Airways LLC
        #"TZ": "",  # ATA Airlines d/b/a ATA (iran)
        "TZQ": "GBR",  # First Choice Airways
        "U7": "USA",  # USA Jet Airlines Inc.
        "UA": "USA",  # United Air Lines Inc.
        #"UD": "",  # Fast Air Carrier Ltd.
        "UN": "RUS",  # Transaero Airlines
        #"UP": "",  # Bahamasair Holding Limited
        "US": "USA",  # US Airways Inc. (Merged with America West 9/05. Reporting for both starting 10/07.)
        "UX": "ESP",  # Air Europa
        #"UYQ": "",  # Aerolineas Uruguayas S.A.
        #"VA (1)": "",  # Venezuelan International Airways
        #"VC": "",  # Servicios Avensa (venezuela)
        #"VE": "",  # Aerovias Venezolanas-Avensa
        "VIQ": "RUS",  # Volga-Dnepr Airlines
        "VP": "BRA",  # Viacao Aerea Sao Paulo
        #"VR": "",  # Transportes Aereos De Cabo (cape verde)
        "VS": "GBR",  # Virgin Atlantic Airways
        #"VX (1)": "",  # Aces Airlines (colombia)
        #"W7": "",  # Western Pacific Airlines (solomon islands)
        #"WD": "",  # Halisa Air (haiti)
        "WE": "USA",  # Centurion Cargo Inc.
        "WO": "USA",  # World Airways Inc.
        #"XC": "",  # Air Caribbean (1)
        "XE": "USA",  # ExpressJet Airlines Inc. (1)
        "XJ": "USA",  # Mesaba Airlines
        "XP": "USA",  # Casino Express
        "YX (1)": "USA",  # Midwest Airline, Inc.
        "ZB": "USA",  # Monarch Airlines
        #"ZUQ": "",  # Zuliana De Aviacion (venezuela)
        "ZX (1)": "CAN",  # Airbc Ltd.
    }

    tablename = "air_carriers"
    table = SQLTable(tablename,
                     ["year", "carrier", "series", "value"],
                     ["int", "varchar(15)", "varchar(15)", "int"])
    table.create()
    table.truncate()

    carriers = {}
    for year in config.STUDY_YEARS:
        for filestem in ["freight", "passengers"]:
            filename = filestem + str(year) + ".csv"
            path = fileutils.getcache(filename, "bts")
            with open(path) as fh:
                csvf = csv.reader(fh)
                next(csvf)
                header = next(csvf)
                for row in csvf:
                    if len(row) == 3:
                        carrier = row[0]
                        #carrier_name = row[1]
                        if carrier in carrier_countries:
                            country = carrier_countries[carrier]
                            value = int(row[2])
                            table.insert([year, country, filestem, value])
def doparse():
    for year in (1972, 1977):
        table = SQLTable("%s.codes_%d" % (config.IO_SCHEMA, year),
                         ["code", "description"],
                         ["char(6)", "text"]).create()
        table.truncate()
        filepath = fileutils.getdatapath("io_sectors_%d.csv" % year, "usa")
        with open(filepath, "r") as fh:
            csvf = csv.reader(fh)
            for row in csvf:
                if len(row) and len(row[0]):
                    table.insert([row[0], row[1]])
        if year == 1972:
            # this code is stated in the rtf file for both 1972 and 1977,
            # but it never appears in 1977; the documentation was
            # probably not properly updated
            table.insert(["870000", "total value added"])

    writer = dbsetup.IOCodeTableWriter()

    writer.set_year(1982, "Io-code.doc")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 8:
                code = line[:6]
                desc = line[8:]
                writer.writerow(code, desc)

    writer.set_year(1987, "SIC-IO.DOC")
    with open(writer.get_filename()) as f:
        pattern = re.compile(r'\s*(\d{1,2})\.(\d{4})\s+([^0-9\*]+)')
        for line in f:
            match = pattern.match(line)
            if match:
                code = match.group(1).rjust(2, '0') + match.group(2)
                desc = match.group(3).strip('(. \r\n')
                writer.writerow(code, desc)

    writer.set_year(1992, "io-code.txt")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 7:
                code = line[:6]
                desc = line[7:]
                writer.writerow(code, desc)

    writer.set_year(1997, "IO-CodeDetail.txt")
    with open(writer.get_filename()) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 2:
                writer.writerow(row[0], row[1])

    writer.set_year(2002, "REV_NAICSUseDetail 4-24-08.txt")
    with open(writer.get_filename()) as f:
        valid_line = re.compile(r"[A-Z0-9]{6}\s")
        line = f.readline().strip().replace("GasPipeVal", "GasPipe ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        codemap = {}
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(
                    dbsetup.replace_tabs(line), fields)
                codemap[row["Commodity"]] = row["CommodityDescription"]
                codemap[row["Industry"]] = row["IndustryDescription"]
        for (code, desc) in codemap.items():
            writer.writerow(code, desc)

    writer.flush()
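# The 2002 file above is fixed-width with a whitespace-aligned header. A
# simplified, self-contained illustration of the kind of work helpers like
# dbsetup.get_header_locations / get_values_for_fields presumably do (the
# real implementations live in dbsetup and may differ): record where each
# header label starts, then slice data lines at those offsets.
def demo_fixed_width(header, line):
    names = header.split()
    starts = [header.index(name) for name in names]
    ends = starts[1:] + [None]
    return {name: line[start:end].strip()
            for (name, start, end) in zip(names, starts, ends)}

# demo_fixed_width("Commodity Industry   Value",
#                  "110100    110100     42.0")
# -> {'Commodity': '110100', 'Industry': '110100', 'Value': '42.0'}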
def parse_env():
    cache_dirs = fileutils.getcachecontents("cn")
    for adir in cache_dirs:
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if len(amount):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
            print(adir, filename)
            title = soup.title.string

            # madly nested tables! we'll just have to find one with a
            # large number of rows and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False  # true once we're past the header
            max_sector_column = 0  # 1 if english is a separate column
            prev_rowdata = None
            prev_rowspans = None

            # long cell values are often expanded into the cell directly
            # below (multiple rows), resulting in rows that are blank
            # except in cells that contain overflow. this forces us to
            # keep state using heuristics.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant,
                # which can't be distinguished without their parent prefix
                prefix = None

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use the previous rowspan if we have one of the buggy
                    # blank cells at the end, which don't carry the
                    # proper rowspan themselves
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this
                    # row. the abs difference below is used for counting
                    # down: if the rowspan in the previous column was 6
                    # and the current one is 1, the difference is -5, and
                    # on the next row it will be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers;
                # we'll just special-case anything we find
                # ("单位" means "unit")
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow;
                    # we already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides, because
                        # rowdata[0] should happen in a header row and
                        # rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    columns = insert_now
                    insert_now = None
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
                elif insert_now is not None and \
                        len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()
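# The rowspan bookkeeping above is the subtle part of parse_env. Here is a
# minimal, self-contained illustration of the core idea (hypothetical data,
# html.parser assumed, simplified to single-row current cells): a cell that
# spans two rows must be padded back into the second row as a blank so that
# column positions stay aligned.
from bs4 import BeautifulSoup

def demo_rowspan_padding():
    html = """<table>
      <tr><td rowspan="2">Sector A</td><td>10</td></tr>
      <tr><td>20</td></tr>
    </table>"""
    soup = BeautifulSoup(html, "html.parser")
    prev_spans = []
    for tr in soup.find_all("tr"):
        rowdata, spans = [], []
        for cell in tr.find_all("td"):
            span = int(cell.get("rowspan", 1))
            # pad blanks for any still-open multi-row cells to our left,
            # decrementing their remaining span as we consume a row
            while len(rowdata) < len(prev_spans) and \
                    prev_spans[len(rowdata)] > 1:
                spans.append(prev_spans[len(rowdata)] - 1)
                rowdata.append("")
            rowdata.append(cell.get_text(strip=True))
            spans.append(span)
        prev_spans = spans
        print(rowdata)  # ['Sector A', '10'] then ['', '20']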
class SectorCodes:

    def __init__(self, codetablename, mode="r"):
        self.mode = mode
        self.codetable = SQLTable(
            codetablename,
            ["code", "description"],
            ["varchar(15)", "varchar(255)"])
        self.code_dict = {}
        self.reverse_code_dict = {}
        self.setup()

    def setup(self):
        if self.mode == "w":
            # invalid codes or codes that we don't want to record
            self.code_blacklist = []
            # if we want to override the code provided with something
            # we make up (or from another set) based on the description
            self.manual_codes = {}
            self.codetable.create()

        # get existing codes from db
        for (code, desc) in self.codetable.getall():
            self.code_dict[code] = desc
            self.reverse_code_dict[desc] = code

        return self

    # for write mode
    def blacklist_code(self, code):
        self.code_blacklist.append(code)
        if code in self.code_dict:
            del self.code_dict[code]

    def set_blacklist(self, code_blacklist):
        self.code_blacklist = []
        for code in code_blacklist:
            self.blacklist_code(code)

    def curate_code_from_desc(self, desc, code):
        self.manual_codes[desc] = code
        self.code_dict[code] = desc
        self.reverse_code_dict[desc] = code

    def add_curated_codes(self, curated_codes):
        for (desc, code) in curated_codes.items():
            self.curate_code_from_desc(desc, code)

    # returns the code used if it was recognized, False otherwise
    def set_code(self, code, desc):
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))
        if type(desc) is str:
            desc = desc.strip()

        if desc in self.manual_codes:
            code = self.manual_codes[desc]

        if code is None or not len(code):
            # ignore empty args
            return False
        elif code in self.code_blacklist:
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            # this is to check for blatant differences
            print(self.code_dict[code], "=>", desc)

        self.code_dict[code] = desc
        # there may be more than one description for the same code
        self.reverse_code_dict[desc] = code
        return code

    def has_code(self, code):
        return code in self.code_dict

    def get_code_for_title(self, desc):
        if desc in self.reverse_code_dict:
            return self.reverse_code_dict[desc]

    def get_title_for_code(self, code):
        if self.has_code(code):
            return self.code_dict[code]
        return False

    def update_codes(self):
        if self.mode != "w":
            raise Exception("SectorCodes created in read-only mode")

        self.codetable.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.codetable.insert([code, desc])
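# Hedged usage sketch for SectorCodes (the table name and codes are made up,
# and a configured database is assumed): build a writable code table, curate
# one code, feed rows through set_code, then persist with update_codes().
def demo_sector_codes():
    tracker = SectorCodes("demo.sector_codes", mode="w")
    tracker.set_blacklist(["n/a"])
    tracker.curate_code_from_desc("Total value added", "VA")
    for (code, desc) in [("0111", "Crop production"), ("n/a", "junk row")]:
        result = tracker.set_code(code, desc)
        if not result:
            print("skipped:", code, desc)
    tracker.update_codes()  # truncates and rewrites the backing table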
def doparse():
    # ppp rank from
    # https://www.cia.gov/library/publications/the-world-factbook/rankorder/2004rank.html
    countries = {
        "LUX": {"fips": "LU", "ppp": 3},
        "USA": {"fips": "US", "ppp": 11},
        "NLD": {"fips": "NL", "ppp": 17},
        "AUT": {"fips": "AU", "ppp": 18},
        "SWE": {"fips": "SW", "ppp": 21},
        "CAN": {"fips": "CA", "ppp": 20},
        "AUS": {"fips": "AS", "ppp": 22},
        "IRL": {"fips": "EI", "ppp": 23},
        "DEU": {"fips": "GM", "ppp": 26},
        "TWN": {"fips": "TW", "ppp": 27},
        "BEL": {"fips": "BE", "ppp": 28},
        "DNK": {"fips": "DA", "ppp": 29},  # FIPS 10-4 uses DA for Denmark
        "FIN": {"fips": "FI", "ppp": 32},
        "GBR": {"fips": "UK", "ppp": 33},
        "FRA": {"fips": "FR", "ppp": 35},
        "JPN": {"fips": "JA", "ppp": 36},
        "KOR": {"fips": "KS", "ppp": 40},
        "ESP": {"fips": "SP", "ppp": 43},
        "ITA": {"fips": "IT", "ppp": 44},
        "CYP": {"fips": "CY", "ppp": 46},
        "SVN": {"fips": "SI", "ppp": 47},
        "CZE": {"fips": "EZ", "ppp": 50},  # EZ really is the FIPS 10-4 code
        "GRC": {"fips": "GR", "ppp": 52},
        "MLT": {"fips": "MT", "ppp": 53},
        "PRT": {"fips": "PO", "ppp": 57},
        "SVK": {"fips": "LO", "ppp": 58},
        "POL": {"fips": "PL", "ppp": 60},
        "EST": {"fips": "EN", "ppp": 61},
        "HUN": {"fips": "HU", "ppp": 63},
        "LTU": {"fips": "LH", "ppp": 65},
        "RUS": {"fips": "RS", "ppp": 71},
        "LVA": {"fips": "LG", "ppp": 75},
        "MEX": {"fips": "MX", "ppp": 85},
        "TUR": {"fips": "TU", "ppp": 86},
        "BRA": {"fips": "BR", "ppp": 92},
        "ROU": {"fips": "RO", "ppp": 97},
        "BGR": {"fips": "BU", "ppp": 101},
        "CHN": {"fips": "CH", "ppp": 121},
        "IDN": {"fips": "ID", "ppp": 156},
        "IND": {"fips": "IN", "ppp": 164},
    }

    tablename = "world_supplement"
    table = SQLTable(tablename,
                     ["year", "country", "pop", "gdp", "ppp"],
                     ["int", "char(3)", "int", "float", "float"]).create()
    table.truncate()

    country_fips = {}
    data = {}
    for (country, info) in countries.items():
        data[country] = {}
        country_fips[info["fips"]] = country

    # this file spec is documented in the xlsx file from the archive
    thisyear = datetime.datetime.now().year
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    if year >= thisyear:
                        # we don't want future projections
                        continue
                    country = country_fips[fips]
                    data[country][year] = {"pop": int(fields[2])}

    worldbank = {
        "ppp": "NY.GNP.PCAP.PP.CD_Indicator_MetaData_en_EXCEL.xls",
        "gdp": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
    }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    # this discards years where we don't have population
                    if year in data[country] and \
                            type(value) is float and value != 0:
                        data[country][year][indicator] = value

    for (country, country_data) in data.items():
        for (year, year_data) in country_data.items():
            ppp = None
            gdp = None
            pop = year_data["pop"]
            if "gdp" in year_data:
                gdp = year_data["gdp"]
            if "ppp" in year_data:
                ppp = year_data["ppp"]
            table.insert([year, country, pop, gdp, ppp])
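# Hypothetical follow-up sketch (my addition, not in the pipeline): the
# world_supplement table stores per-capita gdp alongside population, so an
# aggregate figure is a multiplication away. The layout matches the create()
# call above; SQLTable.getall() is the accessor used elsewhere in this code.
def total_gdp_by_year():
    table = SQLTable("world_supplement",
                     ["year", "country", "pop", "gdp", "ppp"],
                     ["int", "char(3)", "int", "float", "float"])
    totals = {}
    for (year, country, pop, gdp, ppp) in table.getall():
        if gdp is not None:  # gdp column may be NULL
            totals[year] = totals.get(year, 0) + pop * gdp
    return totals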
def parse_int():
    for year in config.STUDY_YEARS:
        tablename = "%s.int_use_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["from_country", "to_country",
                    "commodity", "industry", "value"]
        coltypes = ["char(3)", "char(3)",
                    "varchar(15)", "varchar(15)", "float"]
        use_table = SQLTable(tablename, colnames, coltypes).create()

        tablename = "%s.int_make_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "commodity", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        make_table = SQLTable(tablename, colnames, coltypes).create()

        filename = "IntSUT%s_row_Apr12.xls" % str(year)[2:4]
        subdir = os.path.join("wiod", "intsuts_analytic")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)

        for country in config.countries.keys():
            sheet = wb.sheet_by_name("USE_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                # notes say use tables are broken down by origin
                from_country = row[1]
                if from_country == "ZROW":
                    # shorten so the value still fits in char(3)
                    from_country = "RoW"
                com_code = commodity_tracker.set_code(row[2], row[3])
                if not com_code:
                    continue
                for j in range(4, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        use_table.insert([from_country, country,
                                          com_code, ind_code, value])

            sheet = wb.sheet_by_name("SUP_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # industry first
                        make_table.insert([country, ind_code,
                                           com_code, value])
def parse_io():
    ### for ind x ind tables
    tables = {}
    colnames = ["country", "from_ind", "to_ind", "is_import", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "bool", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s.niot_%d" % (config.WIOD_SCHEMA, year)
        tables[year] = SQLTable(tablename, colnames, coltypes)
        tables[year].drop()
        tables[year].create()
        tables[year].truncate()

    va_sectors = set(config.va_sectors.values())
    for country in config.countries.keys():
        filename = "%s_NIOT_ROW_Apr12.xlsx" % country
        subdir = os.path.join("wiod", "niot")
        path = fileutils.getcache(filename, subdir)
        wb = openpyxl.load_workbook(filename=path, use_iterators=True)
        for year in config.STUDY_YEARS:
            imports = {}
            sheet = wb.get_sheet_by_name("%d" % year)
            rows = sheet.iter_rows()
            industry_row = None
            for row in rows:
                cell = row[0]
                if cell.internal_value == "(industry-by-industry)":
                    industry_row = row
                    break
            row = next(rows)  # industry names
            industry_codes = []
            for (code_cell, desc_cell) in zip(industry_row, row):
                code = code_cell.internal_value
                desc = desc_cell.internal_value
                industry_codes.append(industry_tracker.set_code(code, desc))

            for row in rows:
                from_code = None
                from_desc = None
                is_import = False
                for (to_code, value_cell) in zip(industry_codes, row):
                    column = value_cell.column
                    value = value_cell.internal_value
                    # excel columns are letter strings here
                    if column == "A":
                        from_code = value_cell.internal_value
                    elif column == "B":
                        from_desc = value_cell.internal_value
                    elif column == "C":
                        from_code = industry_tracker.set_code(
                            from_code, from_desc)
                        if not from_code:
                            break
                        if type(value) is str and value == "Imports":
                            is_import = True
                    elif (column > "D" or len(column) > 1) \
                            and to_code and value != 0:
                        tables[year].insert(
                            [country, from_code, to_code, is_import, value])

    ### for supply and use tables
    def parse_sut(sheet_name, table_prefix):
        tables = {}
        colnames = ["country", "commodity", "industry", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        for year in config.STUDY_YEARS:
            tablename = "%s_%d" % (table_prefix, year)
            tables[year] = SQLTable(tablename, colnames, coltypes).create()
            tables[year].truncate()

        for country in config.countries.keys():
            # TODO: more automated way to get this
            if country in ("AUS", "DEU", "GBR", "USA"):
                filename = "%s_SUT_Feb12.xls" % country
            else:
                filename = "%s_SUT_Jan12.xls" % country
            subdir = os.path.join("wiod", "suts")
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)

            # extract supply and use tables at basic prices
            # (the "_bas" sheets)
            sheet = wb.sheet_by_name(sheet_name)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                if not len(row[0].strip()):
                    continue
                year = int(row[0])
                if year not in config.STUDY_YEARS:
                    continue
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        tables[year].insert(
                            [country, com_code, ind_code, value])

    # make tables
    parse_sut("SUP_bas", "%s.make" % config.WIOD_SCHEMA)
    # use tables
    parse_sut("USE_bas", "%s.use" % config.WIOD_SCHEMA)
class IOTableStateTracker(TableStateTracker):

    def __init__(self):
        TableStateTracker.__init__(self)
        self.make_table = None
        self.use_table = None
        self.make_insert_count = 0
        self.use_insert_count = 0

    def flush(self):
        TableStateTracker.flush(self)
        if self.make_insert_count:
            print("%d rows inserted to make table" % self.make_insert_count)
            self.make_insert_count = 0
        if self.use_insert_count:
            print("%d rows inserted to use table" % self.use_insert_count)
            self.use_insert_count = 0

    def create_make_table(self, year):
        print("creating make table for %s..." % year)
        tablename = "%s.make_%s" % (config.IO_SCHEMA, year)
        self.make_table = SQLTable(tablename,
                                   ["industry", "commodity", "thousands"],
                                   ["varchar(6)", "varchar(6)", "bigint"])
        self.make_table.create()
        self.make_table.truncate()

    def create_use_table(self, year, has_margins=False):
        print("creating use table for %s..." % year)
        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")
        tablename = "%s.use_%s" % (config.IO_SCHEMA, year)
        self.use_table = SQLTable(tablename, cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()

    def insert_make(self, indus, commod, makeval, factor=1):
        value = float(makeval) * factor
        if value != 0:
            self.make_table.insert(
                [indus.strip(), commod.strip(), int(value)])
            self.make_insert_count += 1

    def insert_use(self, commod, indus, useval, margins={}, factor=1):
        useval = float(useval) * factor
        nonzero = useval
        values = [commod.strip(), indus.strip(), int(useval)]
        if len(margins) > 0:
            for margin_field in bea.use_table_margins:
                value = 0
                if margin_field in margins:
                    value = float(margins[margin_field]) * factor
                if value:
                    nonzero += value
                values.append(value)
        if nonzero != 0:
            self.use_table.insert(values)
            self.use_insert_count += 1

    # this is for years with no distinction between make and use tables
    def create_simple_transaction_table(self, year, filename, factor=1):
        print("creating transactions table for %s..." % year)
        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename,
                          ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()
        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if value != 0:
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1
        print("%d rows inserted" % insert_count)

    # this is for years that have make and use but no margins
    def create_simple_make_use(self, year, filename, factor=1):
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) == 4:
                    input_ind = cols[0]   # commodity consumed (producing ind)
                    output_ind = cols[1]  # consuming ind (commodity produced)
                    use_dollars = cols[2]   # use in producers' prices
                    make_dollars = cols[3]  # make in producers' prices
                    self.insert_make(input_ind, output_ind,
                                     make_dollars, factor)
                    self.insert_use(commod=input_ind, indus=output_ind,
                                    useval=use_dollars, factor=factor)
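# Hedged usage sketch for IOTableStateTracker (the year and values are
# invented, and a configured database is assumed): create the tables, buffer
# a couple of inserts, then flush to commit and report row counts.
def demo_io_tracker():
    tracker = IOTableStateTracker()
    tracker.create_make_table(2002)
    tracker.create_use_table(2002, has_margins=False)
    tracker.insert_make("110100", "110100", "1234.5")
    tracker.insert_use("110100", "110100", "678.9")
    tracker.flush()  # commits any open transaction, prints insert counts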
def doparse():
    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename,
                     ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    country_fips = {
        "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT", "SW": "SWE",
        "CA": "CAN", "AS": "AUS", "EI": "IRL", "GM": "DEU", "BE": "BEL",
        "TW": "TWN", "DA": "DNK", "UK": "GBR", "FR": "FRA", "JA": "JPN",
        "KS": "KOR", "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE",
        "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK", "PL": "POL",
        "EN": "EST", "HU": "HUN", "LH": "LTU", "LG": "LVA", "MX": "MEX",
        "TU": "TUR", "BR": "BRA", "RO": "ROU", "BU": "BGR", "CH": "CHN",
        "ID": "IDN", "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA",
    }

    # this file spec is documented in the xlsx file from the archive
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
    }
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    imf_fields = (
        "LP",       # population
        "PPPPC",    # ppp per capita
        "NGDPRPC",  # gdp per capita in constant prices
        "NGDP_D",   # gdp deflator
    )

    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")
    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}
        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")
        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i

        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value):  # and year < last_year
                        table.insert([year, country, field,
                                      float(value.replace(",", ""))])
def parse_env():
    # parse english env files
    # TODO: might want to use the energy table as well. it is very
    # comprehensive, but formatted differently and only has 2001
    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste", "Hazardous waste",
                        "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }

    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]

        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or
                         row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:
                        # these columns are repeated in every file, so
                        # only insert them while parsing the first file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i+5]])
            # clear the flag only after the whole first file is done,
            # so every sector gets its totals exactly once
            first_file = False

    # parse chinese env tables. this is a file that we created by
    # compiling older chinese data and manually copying info from the
    # latest (2010) pdf files. skip 2001 because the english version
    # is better. sheet names use ROC-calendar years (89 = 2000);
    # 空汙 = air pollution, 水汙 = water pollution, 廢棄物 = waste
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }
    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)
            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    for j in range(1, len(header)):
                        measurement = header[j].strip()
                        value = row[j]
                        table.insert([sector, measurement, value])
                elif row[0] in ("依行業分", "依部門分"):
                    # start parsing after the "by industry" /
                    # "by sector" marker row
                    should_parse = True