def parse_io():
    """Parse the Chinese 42-sector input-output tables (2005 and 2007
    excel files) into per-year IO tables.

    Final-use and total sector codes are manually curated because the
    spreadsheets merge cells for layout rather than meaning.
    """
    files = {
        2005: fileutils.getdatapath("2005年42部门投入产出流量表.xls", "cn-io"),
        2007: fileutils.getdatapath(
            "0101.xls", "cn-io", "中国投入产出表2007", "excel"),
    }
    tables = HybridTableCreator(config.SCHEMA)
    for (year, filename) in files.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)
        wb = xlrd.open_workbook(filename)
        # in 2005 sheet 0 is x10k RMB, 2007 has only 1 sheet @x10k RMB
        sheet = wb.sheet_by_index(0)
        ind_codes = None
        # the excel files also have this evil problem of merging
        # cells for appearance and not meaning. we only have 2
        # years so curate them
        codes.set_code("FU101", "农村居民消费")
        codes.set_code("FU102", "城镇居民消费")
        codes.set_code("FU103", "政府消费支出")
        codes.set_code("FU201", "固定资本形成总额")
        codes.set_code("FU202", "存货增加")
        codes.set_code("GCF", "资本形成合计")
        codes.set_code("EX", "出口")
        codes.blacklist_code("TI")
        codes.blacklist_code("TII")
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if ind_codes is None:
                # look for the header row: a string cell that strips
                # down to "1" (e.g. "01") marks the column-code header
                for cell in row:
                    if type(cell) is str and cell.strip("0") == "1":
                        ind_codes = []
                        break
                if ind_codes is not None:
                    # record the destination code for every value
                    # column; None marks columns we should skip
                    for cell in row[3:]:
                        if type(cell) is float:
                            cell = str(int(cell))
                        # BUGFIX: originally `table.has_code(cell)`, but
                        # no `table` exists in this scope (NameError);
                        # the curated code tracker is `codes`
                        if regexes.is_num(cell) or codes.has_code(cell):
                            ind_codes.append(cell)
                        else:
                            ind_codes.append(None)
            else:
                from_code = codes.set_code(row[2], row[1])
                if from_code:
                    for (value, to_code) in zip(row[3:], ind_codes):
                        if to_code is not None:
                            tables.insert_io(year, from_code, to_code, value)
        codes.update_codes()
def parse_io():
    """Parse the Chinese 42-sector IO tables (2005, 2007) into the
    per-year IO tables of config.SCHEMA.

    The column header row is detected by a cell stripping to "1";
    final-use/total codes are curated by hand because the spreadsheets
    merge cells for appearance, not meaning.
    """
    files = {
        2005: fileutils.getdatapath("2005年42部门投入产出流量表.xls", "cn-io"),
        2007: fileutils.getdatapath(
            "0101.xls", "cn-io", "中国投入产出表2007", "excel"),
    }
    tables = HybridTableCreator(config.SCHEMA)
    for (year, filename) in files.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)
        wb = xlrd.open_workbook(filename)
        # in 2005 sheet 0 is x10k RMB, 2007 has only 1 sheet @x10k RMB
        sheet = wb.sheet_by_index(0)
        ind_codes = None
        # the excel files also have this evil problem of merging
        # cells for appearance and not meaning. we only have 2
        # years so curate them
        codes.set_code("FU101", "农村居民消费")
        codes.set_code("FU102", "城镇居民消费")
        codes.set_code("FU103", "政府消费支出")
        codes.set_code("FU201", "固定资本形成总额")
        codes.set_code("FU202", "存货增加")
        codes.set_code("GCF", "资本形成合计")
        codes.set_code("EX", "出口")
        codes.blacklist_code("TI")
        codes.blacklist_code("TII")
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if ind_codes is None:
                for cell in row:
                    if type(cell) is str and cell.strip("0") == "1":
                        ind_codes = []
                        break
                if ind_codes is not None:
                    for cell in row[3:]:
                        # numeric codes come back from xlrd as floats
                        if type(cell) is float:
                            cell = str(int(cell))
                        # BUGFIX: was `table.has_code(cell)` — `table`
                        # is undefined here (NameError); use `codes`
                        if regexes.is_num(cell) or codes.has_code(cell):
                            ind_codes.append(cell)
                        else:
                            # None marks a column to skip
                            ind_codes.append(None)
            else:
                from_code = codes.set_code(row[2], row[1])
                if from_code:
                    for (value, to_code) in zip(row[3:], ind_codes):
                        if to_code is not None:
                            tables.insert_io(year, from_code, to_code, value)
        codes.update_codes()
def parse_map():
    """Load the UK code_map.xls crosswalk into <schema>.code_map."""
    code_map = SQLTable(
        "%s.code_map" % config.SCHEMA,
        ["from_code", "to_code", "env_code", "harmonized", "description"],
        ["varchar(3)", "varchar(6)", "varchar(31)", "char(3)", "text"]
    ).create()
    code_map.truncate()
    workbook = xlrd.open_workbook(fileutils.getdatapath("code_map.xls", "uk"))
    sheet = workbook.sheet_by_index(0)

    def clean(cell):
        # xlrd reads numeric codes as floats; normalize to strings,
        # and turn empty strings into NULLs
        if type(cell) is float:
            cell = str(int(cell))
        return cell if len(cell) else None

    # row 0 is the header row
    for rownum in range(1, sheet.nrows):
        row = sheet.row_values(rownum)
        sanitized = [clean(row[col]) for col in (0, 2, 4, 6)]
        code_map.insert(sanitized + [row[7].strip()])
def parse_map():
    """Populate <schema>.code_map from the UK code_map.xls spreadsheet."""
    columns = ["from_code", "to_code", "env_code", "harmonized", "description"]
    coltypes = ["varchar(3)", "varchar(6)", "varchar(31)", "char(3)", "text"]
    table = SQLTable("%s.code_map" % config.SCHEMA, columns, coltypes).create()
    table.truncate()
    path = fileutils.getdatapath("code_map.xls", "uk")
    sheet = xlrd.open_workbook(path).sheet_by_index(0)

    def sanitize_code(code):
        # numeric cells arrive as floats; empty strings become NULLs
        if type(code) is float:
            code = str(int(code))
        if not len(code):
            code = None
        return code

    # skip the header row
    for rownum in range(1, sheet.nrows):
        cells = sheet.row_values(rownum)
        record = [sanitize_code(cells[0]),
                  sanitize_code(cells[2]),
                  sanitize_code(cells[4]),
                  sanitize_code(cells[6]),
                  cells[7].strip()]
        table.insert(record)
def get_centroid(country):
    """Return the (lon, lat) centroid for *country*, memoized in
    WorldMapPlot.__centroids.

    The centroid is the mean vertex of the first coordinate ring in the
    country's gnuplot .dat file (rings are separated by non-coordinate
    lines).
    """
    if country not in WorldMapPlot.__centroids:
        datapath = fileutils.getdatapath(country + ".dat", "gnuplot")
        lon_sum = 0
        lat_sum = 0
        n_points = 0
        with open(datapath) as datfile:
            for line in datfile:
                pair = line.split()
                # turns out the first ring is the most representative
                # in esri's data, so stop at the first ring boundary
                # once we have collected any points
                if len(pair) != 2 and n_points:
                    break
                lon_sum += float(pair[0])
                lat_sum += float(pair[1])
                n_points += 1
            WorldMapPlot.__centroids[country] = (lon_sum / n_points,
                                                 lat_sum / n_points)
    return WorldMapPlot.__centroids[country]
def parse_codes():
    """Create and fill <schema>.sector_map_<year> tables from the
    tw-env sector_map.xls workbook (one worksheet per year, named by
    the year)."""
    workbook = xlrd.open_workbook(
        fileutils.getdatapath("sector_map.xls", "tw-env"))
    for sheet in workbook.sheets():
        year = int(sheet.name)
        # post-2006 tables are commodity-by-industry, so the map also
        # needs the io commodity and industry columns
        extended = year > 2006
        colnames = ["io_sector", "env_sector", "harmonized_env"]
        if extended:
            colnames = colnames + ["io_commod", "io_ind"]
        table = SQLTable("%s.sector_map_%d" % (config.SCHEMA, year),
                         colnames, ["varchar(255)"] * len(colnames))
        table.create()
        table.truncate()
        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)
            harmonized = row[2]
            # numeric codes come back from xlrd as floats
            if type(harmonized) is float:
                harmonized = str(int(harmonized))
            record = [row[0].strip(), row[1].strip(), harmonized.strip()]
            if extended:
                record += [row[4].strip(), row[5].strip()]
            table.insert(record)
def parse_codes():
    """Parse Canadian commodity and industry code lists.

    commodities.csv feeds the com_codes tracker; industries.csv feeds
    the ind_codes tracker and the <schema>.sector_map table mapping
    io_code -> env_code -> harmonized code.
    """
    comcodes = parserutils.add_tracker("%s.com_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("commodities.csv", "ca")
    with open(filename, "r") as fh:
        csvf = csv.reader(fh)
        for row in csvf:
            # only rows whose first cell is a numeric code are commodities
            if len(row) and regexes.is_num(row[0]):
                comcodes.set_code(row[0], row[1])
    comcodes.update_codes()

    maptable = SQLTable("%s.sector_map" % config.SCHEMA,
                        ["io_code", "env_code", "harmonized"],
                        ["varchar(15)", "varchar(15)", "varchar(15)"]).create()

    indcodes = parserutils.add_tracker("%s.ind_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("industries.csv", "ca")
    with open(filename, "r") as fh:
        csvf = csv.reader(fh)
        for row in csvf:
            if len(row) >= 5:
                io_code = row[0]
                if not len(io_code):
                    # empty cells become NULL in the map table
                    io_code = None
                elif len(row[1]):
                    indcodes.set_code(io_code, row[1])
                env_code = row[2]
                if not len(env_code):
                    env_code = None
                elif len(row[3]):
                    indcodes.set_code(env_code, row[3])
                harmonized = row[4]
                # BUGFIX: the outer guard only ensures 5 columns, but the
                # description lives in row[5]; check it exists before
                # indexing to avoid an IndexError on 5-column rows
                if len(harmonized) and regexes.is_num(harmonized) \
                        and len(row) > 5:
                    indcodes.set_code(harmonized, row[5])
                maptable.insert([io_code, env_code, harmonized])
    indcodes.update_codes()
def parse_codes():
    """Populate the WIOD sector map and exchange-rate tables.

    Two phases: (1) load the manually curated sector_map.csv into
    <wiod schema>.sector_map; (2) load the EXR sheet of exr_wiod.xls
    into <wiod schema>.exchange_rates.
    """
    ## manually curated sector map
    table = SQLTable("%s.sector_map" % config.WIOD_SCHEMA,
                     ["io_code", "env_code", "description"],
                     ["varchar(15)", "varchar(15)", "text"]).create()
    table.truncate()
    sector_map = fileutils.getdatapath("sector_map.csv", "wiod")
    # NOTE(review): fh is never closed in this function
    fh = open(sector_map, "r")
    csvf = csv.reader(fh)
    header = next(csvf)  # skip the header row
    for row in csvf:
        io_code = row[0].strip()
        if not len(io_code):
            # empty cells become NULLs
            io_code = None
        env_code = row[1].strip()
        if not len(env_code):
            env_code = None
        desc = row[2].strip()
        table.insert([io_code, env_code, desc])

    ## current exchange rates
    table = SQLTable("%s.exchange_rates" % config.WIOD_SCHEMA,
                     ["country", "year", "rate"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()
    path = fileutils.getcache("exr_wiod.xls", "wiod")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_name("EXR")
    year_list = None
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        if len(row) < 2:
            continue
        if year_list is None:
            # the header row starts with "Country"; year columns are
            # labeled like "_1995", hence the strip of "_" and spaces
            if type(row[0]) is str and row[0].strip() == "Country":
                year_list = [int(cell.strip("_ ")) for cell in row[2:]]
        else:
            # data rows carry a 3-letter country code in column 1
            if type(row[1]) is str and len(row[1].strip()) == 3:
                country = row[1]
                # normalize the nonstandard German code to ISO alpha-3
                if country == "GER":
                    country = "DEU"
                for (year, value) in zip(year_list, row[2:]):
                    table.insert([country, year, value])
def parse_codes():
    """Create and fill <schema>.sector_map_<year> from sector_map.xls
    (tw-env); each worksheet is named after its year."""
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()
    for sheet in sheets:
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)
        if year > 2006:
            # CxI only, need map between commods and inds
            colnames = [
                "io_sector", "env_sector", "harmonized_env", "io_commod",
                "io_ind"
            ]
            coltypes = ["varchar(255)"] * 5
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
            coltypes = ["varchar(255)"] * 3
        table = SQLTable(tablename, colnames, coltypes)
        table.create()
        table.truncate()
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            io_sector = row[0].strip()
            env_sector = row[1].strip()
            harmonized_env = row[2]
            # xlrd returns numeric codes as floats; normalize to string
            if type(harmonized_env) is float:
                harmonized_env = str(int(harmonized_env))
            harmonized_env = harmonized_env.strip()
            if year > 2006:
                # column 3 is skipped in the post-2006 layout
                io_commod = row[4].strip()
                io_ind = row[5].strip()
                table.insert(
                    [io_sector, env_sector, harmonized_env, io_commod,
                     io_ind])
            else:
                table.insert([io_sector, env_sector, harmonized_env])
def get_plot_clauses(self):
    """Build the list of gnuplot plot clauses for the world map.

    Returns one fill + one outline clause per country (highest values
    first, so low values are drawn on top), plus dummy NaN clauses that
    fake a legend for any miniplots.
    """
    clauses = [
        "with lines lc rgb '#CCCCCC' title ''"  # qualifier for continent
    ]
    self.prep_colorspecs()
    # sort by value descending; tuples are (value, country)
    values = ((v, k) for (k, v) in self.country_values.items())
    for (value, country) in sorted(values, reverse=True):
        filename = fileutils.getdatapath(country + ".dat", "gnuplot")
        clauseparts = []
        clauseparts.append("'%s' with filledcurve" % filename)
        clauseparts.append("lc %s" % self.get_colorspec(value))
        if country in self.keytitles:
            clauseparts.append("title '%s'" % self.keytitles[country])
        else:
            clauseparts.append("title ''")
        # outlines
        clauses.append(" ".join(clauseparts))
        clauses.append(
            "'%s' with lines lc rgb 'black' lw 0.3 title ''" % filename)
    if len(self.miniplots):
        # keep colors in order
        inverse = dict((v, k) for (k, v) in self.miniplot_legend.items())
        for color in WorldMapPlot.__better_colors:
            if color in inverse:
                # fake legend by plotting nonexistent data
                clauses.append(
                    "NaN title '%s' lw 10 lc rgb '%s'" % (inverse[color],
                                                          color))
    return clauses
def doparse():
    """Parse UN MDG emissions data (series code 749) from
    mdg_emissions.csv into <un schema>.mdg_emissions as
    (country, year, value) rows."""
    table = SQLTable("%s.mdg_emissions" % config.UN_SCHEMA,
                     ["country", "year", "value"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()
    # invert config.countries (code -> name) into name -> code
    country_dict = dict((v, k) for k, v in config.countries.items())
    # country names the UN uses that differ from ours
    country_dict["Slovakia"] = "SVK"
    country_dict["Russian Federation"] = "RUS"
    # matches 4-digit years (1000-2999) used as column headers
    year_pat = re.compile("[12]\d{3}")
    path = fileutils.getdatapath("mdg_emissions.csv", "un")
    with open(path, "r") as fh:
        csvf = csv.reader(fh)
        header = next(csvf)
        header_index = {}
        years = []
        for i in range(len(header)):
            header_index[header[i]] = i
            if year_pat.match(header[i]):
                years.append(header[i])
        for row in csvf:
            # keep only rows for series 749 — presumably the CO2
            # emissions series; verify against the UN MDG metadata
            if len(row) <= header_index["SeriesCode"] or \
               row[header_index["SeriesCode"]] != "749":
                continue
            country_name = row[header_index["Country"]]
            if country_name not in country_dict:
                # skip countries we have no code for
                continue
            country = country_dict[country_name]
            for year in years:
                value = row[header_index[year]].strip()
                if len(value):
                    table.insert([country, int(year), float(value)])
def parse_env():
    """Parse Taiwanese environmental accounts into per-year env tables.

    Two sources: English-language xls files for 1999/2001/2004 (one file
    per pollutant family), and a manually compiled Chinese workbook
    (sheets.xls) for 2000/2002/2003/2010.
    """
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001
    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste", "Hazardous waste",
                        "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }
    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]
        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                # data rows have a numeric sector code, or are one of
                # the whitelisted final-demand sectors
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or
                         row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:
                        # these columns are repeated in every file, so
                        # only take them from the first one
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i + 5]])
            # BUGFIX: this flag used to be cleared inside the row loop
            # (within the `if first_file:` block), so the shared Total
            # Output / Total Input / GDP columns were only inserted for
            # the first sector row. Clear it only after the whole first
            # file has been processed.
            first_file = False

    # parse chinese env tables
    # this is file that we created by compiling older chinse data and
    # manually copying info from latest (2010) pdf files
    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }
    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)
            # the 2010 tables have several rows that we don't want;
            # wait for the marker row before parsing
            should_parse = (year != 2010)
            for rownum in range(1, sheet.nrows):
                row = sheet.row_values(rownum)
                if should_parse:
                    sector = row[0].strip()
                    # (inner index renamed so it no longer shadows the
                    # row counter)
                    for col in range(1, len(header)):
                        measurement = header[col].strip()
                        value = row[col]
                        table.insert([sector, measurement, value])
                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True
def parse_codes():
    """Parse the Japanese io_env_map.xls workbook into harmonized,
    per-year io, and per-year env code tables.

    One worksheet per study year holds env codes; a single "io" sheet
    holds io codes for all years, with the year number as the column
    header.
    """
    # parse sector maps
    path = fileutils.getdatapath("io_env_map.xls", "jp")
    wb = xlrd.open_workbook(path)
    io_tables = {}
    env_tables = {}  # NOTE(review): populated nowhere, appears unused
    harmonized_sectors = {}  # tracks which harmonized codes were inserted
    harmonized_table = SQLTable(
        "%s.harmonized_codes" % config.SCHEMA, ["code", "description"],
        ["char(3)", "varchar(63)"]).create()
    for year in config.STUDY_YEARS:
        # all io codes are in one sheet, parse afterward
        io_tables[year] = SQLTable(
            "%s.io_map_%d" % (config.SCHEMA, year),
            ["io_sector", "description", "harmonized"],
            ["char(3)", "varchar(63)", "char(3)"]).create()
        io_tables[year].truncate()
        # parse env codes
        env_table = SQLTable(
            "%s.env_map_%d" % (config.SCHEMA, year),
            ["env_sector", "description", "harmonized"],
            ["varchar(7)", "varchar(63)", "char(3)"]).create()
        env_table.truncate()
        sheet = wb.sheet_by_name(str(year))
        # row 0 is the header
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            code = row[0]
            if type(code) is float:
                # 2005 codes are 5 or more digits so this just trims .0
                code = str(int(code)).rjust(3, "0")
            desc = row[1]
            h_code = row[2]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")
            env_table.insert([code, desc, h_code])
            # insert each harmonized code into the shared table once
            if h_code not in harmonized_sectors:
                h_desc = row[3]
                harmonized_sectors[h_code] = 1
                harmonized_table.insert([h_code, h_desc])

    sheet = wb.sheet_by_name("io")
    # map each year (and "harmonized") to its column position
    positions = {}
    header = sheet.row_values(0)
    for i in range(len(header)):
        if type(header[i]) is float:
            positions[int(header[i])] = i
        elif header[i] == "harmonized":
            positions["harmonized"] = i
    for i in range(1, sheet.nrows):
        row = sheet.row_values(i)
        for year in config.STUDY_YEARS:
            code = row[positions[year]]
            if type(code) is float:
                code = str(int(code)).rjust(3, "0")
            # skip years for which this row has no code
            if code is None or not len(code):
                continue
            # the description sits in the column after the year's code
            desc = row[positions[year] + 1]
            h_code = row[positions["harmonized"]]
            if type(h_code) is float:
                h_code = str(int(h_code)).rjust(3, "0")
            io_tables[year].insert([code, desc, h_code])
def parse_nipa_data():
    """Parse BEA NIPA data: PCE category codes, implicit price
    deflators, and the year-by-year IO/PCE bridge files.

    The bridge files change format over the decades, so a nested
    TableStateTracker subclass handles each vintage (old-style xls for
    1967-1982, fixed-width text for 1987, tab/comma-delimited text for
    1992/1997, and a single xls for 2002).
    """
    test_view = "%s.nipa_groups" % common.config.TEST_SCHEMA
    # drop the dependent view first — presumably so the tables it reads
    # can be recreated below; confirm against the test schema setup
    db.execute("DROP VIEW IF EXISTS %s" % test_view)

    # get table for pce category harmonization
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')
    nipa_code_map = {}
    filename = fileutils.getdatapath("nipa_code_map.csv", "usa")
    fh = open(filename)
    csvf = csv.reader(fh)
    for row in csvf:
        if len(row) == 2:
            harmonized = row[0]
            # strip a trailing parenthetical qualifier, e.g. "(12)"
            trailing = trailing_pat.match(harmonized)
            if trailing:
                harmonized = trailing.group(1)
            nipa_code_map[row[1]] = harmonized
    fh.close()

    # get nipa series codes from underlying detail tables
    tracker = TableStateTracker()
    tracker.create_table("%s.pce_codes" % config.NIPA_SCHEMA,
                         ["code", "parent", "description"],
                         ["char(7)", "char(7)", "text"], True)
    number_pat = re.compile('^\d+$')
    # NOTE(review): identical to the trailing_pat compiled above
    trailing_pat = re.compile('(.+) \(.*\d.*\)$')
    filename = fileutils.getcache("Section2All_underlying.csv",
                                  "bea", "nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    # stacks tracking the current ancestry while walking the indented
    # category hierarchy; sentinel values keep them non-empty
    code_stack = [None]
    indent_stack = [-1]
    # the code mapping has been done such that each item is at least at
    # three levels of disaggregation below the top, i.e. there is always
    # an ancestor at the second level. we only want to keep track of the
    # ancestor at the third level (root is zero)
    # the first level below root has goods and services
    # the second level has durable goods, nondurable goods, and services.
    reverse_code_dict = {}
    second_level_nodes = []  # NOTE(review): never used after this point
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 2.4.5U"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 2.4.5U"):
                    # we only need to go through one instance of this table
                    break
                else:
                    if number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        # these are duplicate codes
                        if title.startswith("Market-based PCE"):
                            continue
                        code = row[2]
                        # indentation depth encodes hierarchy level
                        current_indent = len(row[1]) - len(title)
                        while current_indent <= indent_stack[-1]:
                            indent_stack.pop()
                            code_stack.pop()
                        indent_stack.append(current_indent)
                        code_stack.append(code)
                        if len(code_stack) > 1:
                            parent = code_stack[-2]
                        else:
                            parent = None
                        title = title.strip()
                        trailing = trailing_pat.match(title)
                        if trailing:
                            title = trailing.group(1)
                        # deep items map back to their level-3 ancestor
                        if len(code_stack) > 4:
                            reverse_code_dict[title] = code_stack[3]
                        else:
                            reverse_code_dict[title] = code
                        tracker.insert_row((code, parent, title))
    tracker.flush()
    fh.close()

    # table for price deflators
    tracker.create_table("%s.implicit_price_deflators" % config.NIPA_SCHEMA,
                         ["year", "gdp", "pce"],
                         ["int", "float", "float"])
    filename = fileutils.getcache("Section1all_csv.csv", "bea/nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    data = {}
    # we need to parse two rows before we can populate
    years = {}
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 1.1.9"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 1.1.9"):
                    # this is seasonally adjusted version of the same table
                    break
                else:
                    if row[0] == "Line":
                        # header row: remember each year's column index
                        for i in range(len(row)):
                            if number_pat.match(row[i]):
                                year = int(row[i])
                                years[year] = i
                                data[year] = {}
                    elif number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        if title == "Gross domestic product":
                            column = "gdp"
                        elif title == "Personal consumption expenditures":
                            column = "pce"
                        else:
                            continue
                        for (year, colindex) in years.items():
                            data[year][column] = float(row[colindex])
    for (year, results) in data.items():
        tracker.insert_row([year, results["gdp"], results["pce"]])
    tracker.flush()
    fh.close()

    # parse pce bridge
    class IONIPAStateTracker(TableStateTracker):
        # Per-vintage loader for the IO/PCE bridge files. Keeps one
        # pcebridge_<year> table and one nipa_codes_<year> table per
        # year; closures over nipa_code_map / reverse_code_dict above
        # resolve descriptions to nipa group codes.

        def flush(self):
            # also close any file handle left open by get_file_handle
            TableStateTracker.flush(self)
            if self.fh is not None and not self.fh.closed:
                self.fh.close()

        def __init__(self):
            TableStateTracker.__init__(self)
            self.fh = None
            self.code_dict = None
            # column order of every pcebridge_<year> table
            self.value_columns = ["prod_val", "rail_margin", "truck_margin",
                                  "water_margin", "air_margin", "pipe_margin",
                                  "gaspipe_margin", "wholesale_margin",
                                  "retail_margin", "purchase_val"]
            # maps 1967-1982 xls column headers onto our value columns;
            # several tax columns fold into the matching margin column
            self.old_style_field_map = {
                "Producers' Value": "prod_val",
                "MfgExciseTax": "prod_val",
                "RailMargin": "rail_margin",
                "TruckMargin": "truck_margin",
                "WaterMargin": "water_margin",
                "AirMargin": "air_margin",
                "PipeMargin": "pipe_margin",
                "WholesaleMargin": "wholesale_margin",
                "WholesaleTax": "wholesale_margin",
                "RetailMargin": "retail_margin",
                "RetailSalesTax": "retail_margin",
                "OtherRetailTax": "retail_margin",
                "Purchasers' Value": "purchase_val",
            }

        def set_filename(self, filename):
            # filenames are cached under a per-year directory
            path = fileutils.getcache(filename, str(self.year))
            self.filename = path

        def set_year(self, year):
            # flush any pending state, then start the year's bridge table
            self.flush()
            self.year = year
            tablename = "%s.pcebridge_%d" % (config.IO_SCHEMA, year)
            fields = ["pce_code", "commodity"] + self.value_columns
            types = ["varchar(6)", "varchar(6)"] + \
                    ["bigint"]*len(self.value_columns)
            self.create_table(tablename, fields, types)

        def setup_for_codes(self):
            # start collecting pce_code -> description pairs
            self.code_dict = {}

        def flush_codes(self):
            # write collected codes to nipa_codes_<year>, resolving each
            # description to a nipa group code where possible
            if self.code_dict is not None:
                tablename = "%s.nipa_codes_%d" % (config.IO_SCHEMA,
                                                  self.year)
                self.create_table(tablename,
                                  ["pce_code", "nipa_group", "description"],
                                  ["varchar(6)", "char(7)", "text"])
                for (code, raw_desc) in self.code_dict.items():
                    desc = raw_desc
                    # strip durability qualifiers before lookup
                    if desc.endswith('(s.)') or desc.endswith('(d.)'):
                        desc = desc[:-4].strip()
                    elif desc.endswith('(n.d.)'):
                        desc = desc[:-6].strip()
                    if desc in nipa_code_map:
                        desc = nipa_code_map[desc]
                    if desc in reverse_code_dict:
                        nipa_code = reverse_code_dict[desc]
                    else:
                        nipa_code = None
                    #self.current_stmt(code, nipa_code, raw_desc)
                    self.table.insert([code, nipa_code, raw_desc])
                self.code_dict = None
            self.flush()

        def insert_code_row(self, code, desc):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(code) is float:
                code = int(code)
            self.code_dict[str(code)] = desc.strip()

        def insert_row(self, pce_code, commod, dollar_values, factor=1):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(pce_code) is float:
                pce_code = int(pce_code)
            values = [str(pce_code).strip(), commod.strip()]
            for column in self.value_columns:
                if column in dollar_values:
                    if factor == 1:
                        values.append(dollar_values[column])
                    else:
                        # scale to dollars (e.g. source in $1000s)
                        values.append(
                            int(float(dollar_values[column]) * factor))
                else:
                    values.append(None)
            #self.current_stmt(*values)
            self.table.insert(values)

        def parse_old_style_xls(self, year):
            # 1967-1982 vintage: one xls with a bridge sheet and a
            # category-description sheet
            self.set_year(year)
            self.set_filename("%d_PCE_Commodity.xls" % self.year)
            wb = xlrd.open_workbook(self.filename)
            # parse pce bridge data
            sheet = wb.sheet_by_name("%d PCE Workfile - Commodity"
                                     % self.year)
            field_indexes = {}
            pce_code_idx = 0
            commod_idx = 2
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    # header row: learn where each column lives
                    if "PCE Category" in row:
                        pce_code_idx = row.index("PCE Category")
                        if "Commodity" in row:
                            commod_idx = row.index("Commodity")
                        for i in range(len(row)):
                            xls_col = row[i]
                            if xls_col in self.old_style_field_map:
                                colname = self.old_style_field_map[xls_col]
                                # several headers can map to one column
                                if colname not in field_indexes:
                                    field_indexes[colname] = []
                                field_indexes[colname].append(i)
                    elif len(field_indexes):
                        # data row (only once the header has been seen)
                        pce_code = row[pce_code_idx]
                        commod = str(int(row[commod_idx])).rjust(6, "0")
                        values = {}
                        for (field, columns) in field_indexes.items():
                            # doclumentation says units are in 100,000
                            # dollars but the orders of magnitude don't
                            # match up with later years if we use 100
                            components = [int(float(row[column] * 1000))
                                          for column in columns]
                            value = 0
                            for component in components:
                                value += component
                            values[field] = value
                        self.insert_row(pce_code, commod, values)
            # parse codes from neighboring worksheet
            self.setup_for_codes()
            sheet = wb.sheet_by_name("%d PCE Category Descriptions"
                                     % self.year)
            code_idx = None
            desc_idx = None
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    # header labels vary: sometimes suffixed with year
                    codetab = "PCE Category Code"
                    codetab2 = "%s - %d" % (codetab, self.year)
                    if codetab in row or codetab2 in row:
                        if codetab in row:
                            code_idx = row.index(codetab)
                        else:
                            code_idx = row.index(codetab2)
                        desctab = "PCE Category Description - %d" % self.year
                        if desctab in row:
                            desc_idx = row.index(desctab)
                        else:
                            desctab = "PCE Category Description"
                            if desctab in row:
                                desc_idx = row.index(desctab)
                    elif code_idx is not None and desc_idx is not None:
                        code = row[code_idx]
                        desc = str(row[desc_idx])
                        self.insert_code_row(code, desc)
            self.flush_codes()

        def get_file_handle(self, filetype, options={}):
            # NOTE(review): mutable default argument; harmless here
            # since options is only read
            if filetype == "txt":
                self.fh = open(self.filename)
                return self.fh
            elif filetype == "csv":
                self.fh = open(self.filename)
                if "delim" in options:
                    csvf = csv.reader(self.fh, delimiter=options["delim"])
                else:
                    csvf = csv.reader(self.fh)
                return csvf
            elif filetype == "xls":
                wb = xlrd.open_workbook(self.filename)
                return wb

        def parse_text(self, rowcallback):
            # NOTE(review): apparently dead code — `filename` resolves
            # to the enclosing parse_nipa_data local (not this class's
            # self.filename) and `this` is undefined, so calling this
            # would raise NameError
            path = fileutils.getcache(filename, str(self.year))
            f = open(path)
            for line in f:
                rowcallback(line, this)
            f.close()

    tracker = IONIPAStateTracker()
    # 1967-1982: old-style xls workbooks
    tracker.parse_old_style_xls(1967)
    tracker.parse_old_style_xls(1972)
    tracker.parse_old_style_xls(1977)
    tracker.parse_old_style_xls(1982)

    # 1987: fixed-width text file, one column slice per value field
    tracker.set_year(1987)
    tracker.set_filename("tbld-87.dat")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 103:
            continue
        commod = line[0:6]
        pce_code = line[14:18]
        values = {
            "prod_val": line[21:30],
            "rail_margin": line[30:39],
            "truck_margin": line[39:48],
            "water_margin": line[48:57],
            "air_margin": line[57:66],
            "pipe_margin": line[66:75],
            "wholesale_margin": line[75:84],
            "retail_margin": line[84:93],
            "purchase_val": line[93:102],
        }
        tracker.insert_row(pce_code, commod, values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("io-nipa.doc")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 27:
            continue
        code = line[0:4].strip()
        desc = line[26:].strip()
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1992: tab-delimited text
    tracker.set_year(1992)
    tracker.set_filename("TabD.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        values = {
            "prod_val": row[4],
            "rail_margin": row[5],
            "truck_margin": row[6],
            "water_margin": row[7],
            "air_margin": row[8],
            "pipe_margin": row[9],
            "gaspipe_margin": row[10],
            "wholesale_margin": row[11],
            "retail_margin": row[12],
            "purchase_val": row[13],
        }
        tracker.insert_row(row[2], row[0], values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        code = row[0]
        desc = row[4]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1997: comma-delimited text
    tracker.set_year(1997)
    tracker.set_filename("AppendixC_Detail.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        values = {
            "prod_val": row[3],
            "rail_margin": row[4],
            "truck_margin": row[5],
            "water_margin": row[6],
            "air_margin": row[7],
            "pipe_margin": row[8],
            "gaspipe_margin": row[9],
            "wholesale_margin": row[10],
            "retail_margin": row[11],
            "purchase_val": row[12],
        }
        tracker.insert_row(row[1], row[0], values, 1000)
    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA_PCE.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        code = row[1]
        desc = row[2]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 2002: single xls; pce codes are synthesized from the order the
    # descriptions first appear
    tracker.set_year(2002)
    tracker.setup_for_codes()
    # do this simultaneously since it's all one file
    tracker.set_filename("2002_PCE_Bridge.xls")
    wb = tracker.get_file_handle("xls")
    naics_pat = re.compile('[A-Z0-9]{6}')
    sheet = wb.sheet_by_name("PCE_Bridge_Detail")
    pce_codes = []
    for rowindex in range(sheet.nrows):
        row = sheet.row_values(rowindex)
        # data rows have 13 cells with a NAICS-style code in column 1
        if len(row) == 13 and naics_pat.match(row[1]):
            pce_desc = row[0]
            # we don't need the distinction between households and
            # nonprofit institutions service households
            parts = pce_desc.split('-')
            if len(parts) > 1:
                lastpart = parts[-1].strip()
                if lastpart == 'HH' or lastpart == 'NPISH':
                    pce_desc = '-'.join(parts[:-1])
            pce_desc = pce_desc.strip()
            if pce_desc in pce_codes:
                pce_code = pce_codes.index(pce_desc)
            else:
                pce_code = len(pce_codes)
                pce_codes.append(pce_desc)
                tracker.insert_code_row(str(pce_code), pce_desc)
            values = {
                "prod_val": row[3],
                "rail_margin": row[4],
                "truck_margin": row[5],
                "water_margin": row[6],
                "air_margin": row[7],
                "pipe_margin": row[8],
                "gaspipe_margin": row[9],
                "wholesale_margin": row[10],
                "retail_margin": row[11],
                "purchase_val": row[12],
            }
            tracker.insert_row(str(pce_code), row[1], values, 1000)
    tracker.flush_codes()
def get_data_location(self):
    """Return the path to the gnuplot continent outline data file."""
    continent_file = fileutils.getdatapath("continent.dat", "gnuplot")
    return continent_file
def parse_nipa_data():
    """Parse BEA NIPA personal-consumption-expenditure (PCE) data.

    Populates several database tables:
      * <NIPA_SCHEMA>.pce_codes -- PCE category code hierarchy taken from
        underlying detail table 2.4.5U,
      * <NIPA_SCHEMA>.implicit_price_deflators -- GDP/PCE deflators from
        table 1.1.9,
      * <IO_SCHEMA>.pcebridge_<year> / <IO_SCHEMA>.nipa_codes_<year> --
        PCE bridge values and category descriptions for the benchmark IO
        years 1967-2002, each year in its own source format.
    """
    test_view = "%s.nipa_groups" % common.config.TEST_SCHEMA
    db.execute("DROP VIEW IF EXISTS %s" % test_view)

    # get table for pce category harmonization
    trailing_pat = re.compile(r'(.+) \(.*\d.*\)$')
    nipa_code_map = {}
    filename = fileutils.getdatapath("nipa_code_map.csv", "usa")
    fh = open(filename)
    csvf = csv.reader(fh)
    for row in csvf:
        if len(row) == 2:
            harmonized = row[0]
            # drop a trailing parenthesized qualifier, e.g. "(123)"
            trailing = trailing_pat.match(harmonized)
            if trailing:
                harmonized = trailing.group(1)
            nipa_code_map[row[1]] = harmonized
    fh.close()

    # get nipa series codes from underlying detail tables
    tracker = TableStateTracker()
    tracker.create_table("%s.pce_codes" % config.NIPA_SCHEMA,
                         ["code", "parent", "description"],
                         ["char(7)", "char(7)", "text"], True)
    number_pat = re.compile(r'^\d+$')
    filename = fileutils.getcache("Section2All_underlying.csv", "bea", "nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    code_stack = [None]
    indent_stack = [-1]

    # the code mapping has been done such that each item is at least at
    # three levels of disaggregation below the top, i.e. there is always
    # an ancestor at the second level. we only want to keep track of the
    # ancestor at the third level (root is zero)
    # the first level below root has goods and services
    # the second level has durable goods, nondurable goods, and services.
    reverse_code_dict = {}

    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 2.4.5U"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 2.4.5U"):
                    # we only need to go through one instance of this table
                    break
                elif number_pat.match(row[0]) and len(row) > 2:
                    title = row[1].lstrip()
                    # these are duplicate codes
                    if title.startswith("Market-based PCE"):
                        continue
                    code = row[2]
                    # the number of leading spaces encodes tree depth
                    current_indent = len(row[1]) - len(title)
                    while current_indent <= indent_stack[-1]:
                        indent_stack.pop()
                        code_stack.pop()
                    indent_stack.append(current_indent)
                    code_stack.append(code)
                    if len(code_stack) > 1:
                        parent = code_stack[-2]
                    else:
                        parent = None
                    title = title.strip()
                    trailing = trailing_pat.match(title)
                    if trailing:
                        title = trailing.group(1)
                    # deep items map back to their third-level ancestor
                    if len(code_stack) > 4:
                        reverse_code_dict[title] = code_stack[3]
                    else:
                        reverse_code_dict[title] = code
                    tracker.insert_row((code, parent, title))
    tracker.flush()
    fh.close()

    # table for price deflators
    tracker.create_table("%s.implicit_price_deflators" % config.NIPA_SCHEMA,
                         ["year", "gdp", "pce"],
                         ["int", "float", "float"])
    filename = fileutils.getcache("Section1all_csv.csv", "bea/nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    data = {}
    # we need to parse two rows before we can populate
    years = {}
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 1.1.9"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 1.1.9"):
                    # this is seasonally adjusted version of the same table
                    break
                elif row[0] == "Line":
                    # header row: map each year to its column index
                    for i in range(len(row)):
                        if number_pat.match(row[i]):
                            year = int(row[i])
                            years[year] = i
                            data[year] = {}
                elif number_pat.match(row[0]) and len(row) > 2:
                    title = row[1].lstrip()
                    if title == "Gross domestic product":
                        column = "gdp"
                    elif title == "Personal consumption expenditures":
                        column = "pce"
                    else:
                        continue
                    for (year, colindex) in years.items():
                        data[year][column] = float(row[colindex])
    for (year, results) in data.items():
        tracker.insert_row([year, results["gdp"], results["pce"]])
    tracker.flush()
    fh.close()

    # parse pce bridge
    class IONIPAStateTracker(TableStateTracker):
        """Tracker that knows the pce bridge column layout and per-year
        table/file naming, plus the code-description side tables."""

        def __init__(self):
            TableStateTracker.__init__(self)
            self.fh = None          # currently open source file, if any
            self.code_dict = None   # pce code -> raw description, per year
            self.value_columns = [
                "prod_val", "rail_margin", "truck_margin", "water_margin",
                "air_margin", "pipe_margin", "gaspipe_margin",
                "wholesale_margin", "retail_margin", "purchase_val"]
            # maps spreadsheet column headers of the pre-1987 files onto
            # our canonical column names; tax columns fold into margins
            self.old_style_field_map = {
                "Producers' Value": "prod_val",
                "MfgExciseTax": "prod_val",
                "RailMargin": "rail_margin",
                "TruckMargin": "truck_margin",
                "WaterMargin": "water_margin",
                "AirMargin": "air_margin",
                "PipeMargin": "pipe_margin",
                "WholesaleMargin": "wholesale_margin",
                "WholesaleTax": "wholesale_margin",
                "RetailMargin": "retail_margin",
                "RetailSalesTax": "retail_margin",
                "OtherRetailTax": "retail_margin",
                "Purchasers' Value": "purchase_val",
            }

        def flush(self):
            TableStateTracker.flush(self)
            if self.fh is not None and not self.fh.closed:
                self.fh.close()

        def set_filename(self, filename):
            """Resolve filename within the cache dir for the current year."""
            path = fileutils.getcache(filename, str(self.year))
            self.filename = path

        def set_year(self, year):
            """Flush pending work and create the bridge table for year."""
            self.flush()
            self.year = year
            tablename = "%s.pcebridge_%d" % (config.IO_SCHEMA, year)
            fields = ["pce_code", "commodity"] + self.value_columns
            types = ["varchar(6)", "varchar(6)"] + \
                ["bigint"] * len(self.value_columns)
            self.create_table(tablename, fields, types)

        def setup_for_codes(self):
            self.code_dict = {}

        def flush_codes(self):
            """Write collected code descriptions, harmonized where possible."""
            if self.code_dict is not None:
                tablename = "%s.nipa_codes_%d" % (config.IO_SCHEMA, self.year)
                self.create_table(tablename,
                                  ["pce_code", "nipa_group", "description"],
                                  ["varchar(6)", "char(7)", "text"])
                for (code, raw_desc) in self.code_dict.items():
                    desc = raw_desc
                    # strip durability annotations before harmonizing
                    if desc.endswith('(s.)') or desc.endswith('(d.)'):
                        desc = desc[:-4].strip()
                    elif desc.endswith('(n.d.)'):
                        desc = desc[:-6].strip()
                    if desc in nipa_code_map:
                        desc = nipa_code_map[desc]
                    if desc in reverse_code_dict:
                        nipa_code = reverse_code_dict[desc]
                    else:
                        nipa_code = None
                    self.table.insert([code, nipa_code, raw_desc])
                self.code_dict = None
            self.flush()

        def insert_code_row(self, code, desc):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(code) is float:
                code = int(code)
            self.code_dict[str(code)] = desc.strip()

        def insert_row(self, pce_code, commod, dollar_values, factor=1):
            """Insert one bridge row, scaling values into whole dollars."""
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(pce_code) is float:
                pce_code = int(pce_code)
            values = [str(pce_code).strip(), commod.strip()]
            for column in self.value_columns:
                if column in dollar_values:
                    if factor == 1:
                        values.append(dollar_values[column])
                    else:
                        values.append(
                            int(float(dollar_values[column]) * factor))
                else:
                    values.append(None)
            self.table.insert(values)

        def parse_old_style_xls(self, year):
            """Parse the pre-1987 per-year xls workbooks (values + codes)."""
            self.set_year(year)
            self.set_filename("%d_PCE_Commodity.xls" % self.year)
            wb = xlrd.open_workbook(self.filename)
            # parse pce bridge data
            sheet = wb.sheet_by_name(
                "%d PCE Workfile - Commodity" % self.year)
            field_indexes = {}
            pce_code_idx = 0
            commod_idx = 2
            # NOTE(review): header/data row dispatch reconstructed from a
            # whitespace-mangled source -- verify against the original file.
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    if "PCE Category" in row:
                        # header row: locate the columns we care about
                        pce_code_idx = row.index("PCE Category")
                        if "Commodity" in row:
                            commod_idx = row.index("Commodity")
                        for i in range(len(row)):
                            xls_col = row[i]
                            if xls_col in self.old_style_field_map:
                                colname = self.old_style_field_map[xls_col]
                                if colname not in field_indexes:
                                    field_indexes[colname] = []
                                field_indexes[colname].append(i)
                    elif len(field_indexes):
                        pce_code = row[pce_code_idx]
                        commod = str(int(row[commod_idx])).rjust(6, "0")
                        values = {}
                        for (field, columns) in field_indexes.items():
                            # documentation says units are in 100,000 dollars
                            # but the orders of magnitude don't match up with
                            # later years if we use 100
                            values[field] = sum(
                                int(float(row[column] * 1000))
                                for column in columns)
                        self.insert_row(pce_code, commod, values)
            # parse codes from neighboring worksheet
            self.setup_for_codes()
            sheet = wb.sheet_by_name(
                "%d PCE Category Descriptions" % self.year)
            code_idx = None
            desc_idx = None
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    # header labels vary: some sheets append " - <year>"
                    codetab = "PCE Category Code"
                    codetab2 = "%s - %d" % (codetab, self.year)
                    if codetab in row or codetab2 in row:
                        if codetab in row:
                            code_idx = row.index(codetab)
                        else:
                            code_idx = row.index(codetab2)
                        desctab = "PCE Category Description - %d" % self.year
                        if desctab in row:
                            desc_idx = row.index(desctab)
                        else:
                            desctab = "PCE Category Description"
                            if desctab in row:
                                desc_idx = row.index(desctab)
                    elif code_idx is not None and desc_idx is not None:
                        code = row[code_idx]
                        desc = str(row[desc_idx])
                        self.insert_code_row(code, desc)
            self.flush_codes()

        def get_file_handle(self, filetype, options=None):
            """Open self.filename as raw text, a csv reader, or a workbook.

            BUGFIX: `options` previously defaulted to a mutable `{}`.
            """
            if options is None:
                options = {}
            if filetype == "txt":
                self.fh = open(self.filename)
                return self.fh
            elif filetype == "csv":
                self.fh = open(self.filename)
                if "delim" in options:
                    csvf = csv.reader(self.fh, delimiter=options["delim"])
                else:
                    csvf = csv.reader(self.fh)
                return csvf
            elif filetype == "xls":
                wb = xlrd.open_workbook(self.filename)
                return wb

        def parse_text(self, rowcallback):
            """Invoke rowcallback(line, tracker) for each line of the file.

            BUGFIX: the original referenced the undefined names `filename`
            and `this` and never closed the file; it appears unused but is
            repaired for safety.
            """
            with open(self.filename) as f:
                for line in f:
                    rowcallback(line, self)

    tracker = IONIPAStateTracker()
    # 1967-1982 all share the old xls layout
    tracker.parse_old_style_xls(1967)
    tracker.parse_old_style_xls(1972)
    tracker.parse_old_style_xls(1977)
    tracker.parse_old_style_xls(1982)

    # 1987: fixed-width text file, values in thousands of dollars
    tracker.set_year(1987)
    tracker.set_filename("tbld-87.dat")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 103:
            continue
        commod = line[0:6]
        pce_code = line[14:18]
        values = {
            "prod_val": line[21:30],
            "rail_margin": line[30:39],
            "truck_margin": line[39:48],
            "water_margin": line[48:57],
            "air_margin": line[57:66],
            "pipe_margin": line[66:75],
            "wholesale_margin": line[75:84],
            "retail_margin": line[84:93],
            "purchase_val": line[93:102],
        }
        tracker.insert_row(pce_code, commod, values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("io-nipa.doc")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 27:
            continue
        code = line[0:4].strip()
        desc = line[26:].strip()
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1992: tab-delimited
    tracker.set_year(1992)
    tracker.set_filename("TabD.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        values = {
            "prod_val": row[4],
            "rail_margin": row[5],
            "truck_margin": row[6],
            "water_margin": row[7],
            "air_margin": row[8],
            "pipe_margin": row[9],
            "gaspipe_margin": row[10],
            "wholesale_margin": row[11],
            "retail_margin": row[12],
            "purchase_val": row[13],
        }
        tracker.insert_row(row[2], row[0], values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        code = row[0]
        desc = row[4]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    # 1997: comma-delimited
    tracker.set_year(1997)
    tracker.set_filename("AppendixC_Detail.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        values = {
            "prod_val": row[3],
            "rail_margin": row[4],
            "truck_margin": row[5],
            "water_margin": row[6],
            "air_margin": row[7],
            "pipe_margin": row[8],
            "gaspipe_margin": row[9],
            "wholesale_margin": row[10],
            "retail_margin": row[11],
            "purchase_val": row[12],
        }
        tracker.insert_row(row[1], row[0], values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA_PCE.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        code = row[1]
        desc = row[2]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    tracker.set_year(2002)
    tracker.setup_for_codes()  # do this simultaneously since it's all one file
    tracker.set_filename("2002_PCE_Bridge.xls")
    wb = tracker.get_file_handle("xls")
    naics_pat = re.compile('[A-Z0-9]{6}')
    sheet = wb.sheet_by_name("PCE_Bridge_Detail")
    # 2002 has no category codes at all; synthesize sequential ones keyed
    # on the (normalized) category description
    pce_codes = []
    for rowindex in range(sheet.nrows):
        row = sheet.row_values(rowindex)
        if len(row) == 13 and naics_pat.match(row[1]):
            pce_desc = row[0]
            # we don't need the distinction between households and
            # nonprofit institutions service households
            parts = pce_desc.split('-')
            if len(parts) > 1:
                lastpart = parts[-1].strip()
                if lastpart == 'HH' or lastpart == 'NPISH':
                    pce_desc = '-'.join(parts[:-1])
            pce_desc = pce_desc.strip()
            if pce_desc in pce_codes:
                pce_code = pce_codes.index(pce_desc)
            else:
                pce_code = len(pce_codes)
                pce_codes.append(pce_desc)
                tracker.insert_code_row(str(pce_code), pce_desc)
            values = {
                "prod_val": row[3],
                "rail_margin": row[4],
                "truck_margin": row[5],
                "water_margin": row[6],
                "air_margin": row[7],
                "pipe_margin": row[8],
                "gaspipe_margin": row[9],
                "wholesale_margin": row[10],
                "retail_margin": row[11],
                "purchase_val": row[12],
            }
            tracker.insert_row(str(pce_code), row[1], values, 1000)
    tracker.flush_codes()
def doparse():
    """Load the BEA IO sector code tables for 1972-2002 into the database."""
    # 1972/1977 sector lists were transcribed into local csv files
    for year in (1972, 1977):
        codes_table = SQLTable("%s.codes_%d" % (config.IO_SCHEMA, year),
                               ["code", "description"],
                               ["char(6)", "text"]).create()
        codes_table.truncate()
        csvpath = fileutils.getdatapath("io_sectors_%d.csv" % year, "usa")
        with open(csvpath, "r") as infile:
            for record in csv.reader(infile):
                if len(record) and len(record[0]):
                    codes_table.insert([record[0], record[1]])
        if year == 1972:
            # this is stated in the rtf file for both 1972 and 1977
            # but this code never appears in 1977, the documentation
            # was probably not properly updated
            codes_table.insert(["870000", "total value added"])

    writer = dbsetup.IOCodeTableWriter()

    # 1982: fixed-width listing, code in columns 0-5, description from 8
    writer.set_year(1982, "Io-code.doc")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 8:
                writer.writerow(line[:6], line[8:])

    # 1987: "NN.NNNN description" lines; punctuation stripped off the text
    writer.set_year(1987, "SIC-IO.DOC")
    with open(writer.get_filename()) as f:
        pattern = re.compile('\s*(\d{1,2})\.(\d{4})\s+([^0-9\*]+)')
        for line in f:
            m = pattern.match(line)
            if m is None:
                continue
            writer.writerow(m.group(1).rjust(2, '0') + m.group(2),
                            m.group(3).strip('(. \r\n'))

    # 1992: fixed-width again, description starts one column earlier
    writer.set_year(1992, "io-code.txt")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 7:
                writer.writerow(line[:6], line[7:])

    # 1997: plain two-column csv
    writer.set_year(1997, "IO-CodeDetail.txt")
    with open(writer.get_filename()) as f:
        for record in csv.reader(f):
            if len(record) == 2:
                writer.writerow(record[0], record[1])

    # 2002: no standalone code list; harvest commodity and industry
    # descriptions out of the NAICS use-table detail file
    writer.set_year(2002, "REV_NAICSUseDetail 4-24-08.txt")
    with open(writer.get_filename()) as f:
        valid_line = re.compile("[A-Z0-9]{6}\s")
        header = f.readline().strip().replace("GasPipeVal", "GasPipe ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(header))
        codemap = {}
        for line in f:
            if valid_line.match(line):
                record = dbsetup.get_values_for_fields(
                    dbsetup.replace_tabs(line), fields)
                codemap[record["Commodity"]] = \
                    record["CommodityDescription"]
                codemap[record["Industry"]] = \
                    record["IndustryDescription"]
        for (code, desc) in codemap.items():
            writer.writerow(code, desc)
    writer.flush()
def doparse():
    """Load the BEA IO sector code tables for 1972-2002 into the database.

    NOTE(review): this definition is identical to the doparse defined
    earlier in this file; if both live in one module the second silently
    replaces the first -- confirm whether these come from separate files.
    """
    # 1972/1977 sector lists come from locally transcribed csv files.
    for year in (1972, 1977):
        table = SQLTable("%s.codes_%d" % (config.IO_SCHEMA, year),
                         ["code", "description"],
                         ["char(6)", "text"]).create()
        table.truncate()
        filepath = fileutils.getdatapath("io_sectors_%d.csv" % year, "usa")
        with open(filepath, "r") as fh:
            csvf = csv.reader(fh)
            for row in csvf:
                # skip blank rows and rows with an empty code cell
                if len(row) and len(row[0]):
                    table.insert([row[0], row[1]])
        if year == 1972:
            # this is stated in the rtf file for both 1972 and 1977
            # but this code never appears in 1977, the documentation
            # was probably not properly updated
            table.insert(["870000", "total value added"])

    writer = dbsetup.IOCodeTableWriter()

    # 1982: fixed-width listing; code in columns 0-5, description from 8.
    writer.set_year(1982, "Io-code.doc")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 8:
                code = line[:6]
                desc = line[8:]
                writer.writerow(code, desc)

    # 1987: "NN.NNNN description" lines; strip punctuation from the text.
    writer.set_year(1987, "SIC-IO.DOC")
    with open(writer.get_filename()) as f:
        pattern = re.compile('\s*(\d{1,2})\.(\d{4})\s+([^0-9\*]+)')
        for line in f:
            match = pattern.match(line)
            if match:
                code = match.group(1).rjust(2, '0') + match.group(2)
                desc = match.group(3).strip('(. \r\n')
                writer.writerow(code, desc)

    # 1992: fixed-width again; description starts one column earlier.
    writer.set_year(1992, "io-code.txt")
    with open(writer.get_filename()) as f:
        for line in f:
            if len(line) > 7:
                code = line[:6]
                desc = line[7:]
                writer.writerow(code, desc)

    # 1997: plain two-column csv.
    writer.set_year(1997, "IO-CodeDetail.txt")
    with open(writer.get_filename()) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 2:
                writer.writerow(row[0], row[1])

    # 2002: harvest commodity/industry descriptions from the use table.
    writer.set_year(2002, "REV_NAICSUseDetail 4-24-08.txt")
    with open(writer.get_filename()) as f:
        valid_line = re.compile("[A-Z0-9]{6}\s")
        # header tweak so column names line up for get_header_locations
        line = f.readline().strip().replace("GasPipeVal", "GasPipe ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        codemap = {}
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(dbsetup.replace_tabs(line),
                                                    fields)
                codemap[row["Commodity"]] = row["CommodityDescription"]
                codemap[row["Industry"]] = row["IndustryDescription"]
        for (code, desc) in codemap.items():
            writer.writerow(code, desc)
    writer.flush()
def parse_codes():
    """Parse the jp io/env sector maps into per-year mapping tables plus a
    shared table of harmonized sector codes."""

    def normalize(cell):
        # xlrd returns numeric cells as floats; render them as zero-padded
        # three-character code strings (longer codes pass through, which
        # just trims the trailing ".0" for 2005's 5+ digit codes)
        if type(cell) is float:
            return str(int(cell)).rjust(3, "0")
        return cell

    # parse sector maps
    path = fileutils.getdatapath("io_env_map.xls", "jp")
    workbook = xlrd.open_workbook(path)

    io_tables = {}
    env_tables = {}
    seen_harmonized = {}
    harmonized_table = SQLTable("%s.harmonized_codes" % config.SCHEMA,
                                ["code", "description"],
                                ["char(3)", "varchar(63)"]).create()

    for year in config.STUDY_YEARS:
        # all io codes are in one sheet, parsed after this loop
        io_tables[year] = SQLTable(
            "%s.io_map_%d" % (config.SCHEMA, year),
            ["io_sector", "description", "harmonized"],
            ["char(3)", "varchar(63)", "char(3)"]).create()
        io_tables[year].truncate()

        # env codes live in one worksheet per study year
        env_table = SQLTable(
            "%s.env_map_%d" % (config.SCHEMA, year),
            ["env_sector", "description", "harmonized"],
            ["varchar(7)", "varchar(63)", "char(3)"]).create()
        env_table.truncate()

        sheet = workbook.sheet_by_name(str(year))
        for rownum in range(1, sheet.nrows):
            row = sheet.row_values(rownum)
            code = normalize(row[0])
            desc = row[1]
            h_code = normalize(row[2])
            env_table.insert([code, desc, h_code])
            # record each harmonized sector the first time it appears
            if h_code not in seen_harmonized:
                h_desc = row[3]
                seen_harmonized[h_code] = 1
                harmonized_table.insert([h_code, h_desc])

    # the "io" sheet holds every year side by side; the header row tells
    # us which column belongs to which year
    sheet = workbook.sheet_by_name("io")
    header = sheet.row_values(0)
    positions = {}
    for col in range(len(header)):
        if type(header[col]) is float:
            positions[int(header[col])] = col
        elif header[col] == "harmonized":
            positions["harmonized"] = col

    for rownum in range(1, sheet.nrows):
        row = sheet.row_values(rownum)
        for year in config.STUDY_YEARS:
            code = normalize(row[positions[year]])
            if code is None or not len(code):
                continue
            desc = row[positions[year] + 1]
            h_code = normalize(row[positions["harmonized"]])
            io_tables[year].insert([code, desc, h_code])
def parse_env():
    """Parse Taiwan environmental accounts into per-year env_<year> tables.

    English xls files cover 1999/2001/2004; a locally compiled Chinese
    workbook covers 2000/2002/2003/2010.
    """
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001
    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    # one source workbook per pollution/resource category; "columns" are
    # the series names for the data columns starting at index 5
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste", "Hazardous waste",
                        "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }

    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]

        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                # data rows either start with a sector number or carry a
                # whitelisted final-demand sector name in column 1
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or
                         row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:
                        # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                        # NOTE(review): first_file goes False after the
                        # first matching row, so Total Output/Input/GDP are
                        # only recorded for the first sector of each year.
                        # The comment above suggests per-file dedup was
                        # intended -- confirm against the original source.
                        first_file = False
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i+5]])

    # parse chinese env tables
    # this is file that we created by compiling older chinse data and
    # manually copying info from latest (2010) pdf files
    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }
    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)
            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    # NOTE(review): inner loop reuses the name `i`,
                    # shadowing the outer row index (harmless in Python's
                    # for semantics but worth renaming)
                    for i in range(1, len(header)):
                        measurement = header[i].strip()
                        value = row[i]
                        table.insert([sector, measurement, value])
                elif row[0] in ("依行業分", "依部門分"):
                    # 2010: start parsing after the section marker row
                    should_parse = True