Example #1
File: parser.py Project: sonya/eea
def parse_io():
    # we'll just parse the same file twice,
    # once each for make/use
    for (agglevel, (intermediate, finaldemand)) in io_tables.items():
        colnames = ["year", "industry", "commodity", "value"]

        if agglevel == "detail":
            coltypes = ["int", "varchar(15)", "varchar(15)", "float"]
            colfuncs = {
                "industry": get_industry_code,
                "commodity": get_commodity_code,
                }
        else:
            coltypes = ["int", "varchar(255)", "varchar(255)", "float"]
            colfuncs = {
                "industry": strip_millions, "commodity": strip_millions}

        # parse intermediate
        filename = "%s-eng.csv" % intermediate
        filepath = fileutils.getcache(filename, "ca")
        io_col_map["industry"] = "IND"
        if agglevel == "detail":
            io_col_map["commodity"] = "COMMOD"
        else:
            io_col_map["commodity"] = "COMM"

        csvtable = CSVTable(filepath, True, "cp1252")
        tablename = "%s.io_make_%s" % (config.SCHEMA, agglevel)
        csvtable.create_sql_table(tablename, colnames, coltypes)
        csvtable.parse_to_sql(io_col_map, colfuncs, skip_make, cascade=True)
    
        # we can reuse CSVTable for the same source file
        tablename = "%s.io_use_%s" % (config.SCHEMA, agglevel)
        csvtable.create_sql_table(tablename, colnames, coltypes)
        csvtable.parse_to_sql(io_col_map, colfuncs, skip_use)

        # parse final demand
        filename = "%s-eng.csv" % finaldemand
        filepath = fileutils.getcache(filename, "ca")
        io_col_map["commodity"] = "COMM"
        io_col_map["industry"] = "CAT"

        fdtable = CSVTable(filepath, True, "cp1252")
        tablename = "%s.io_fd_%s" % (config.SCHEMA, agglevel)
        fdtable.create_sql_table(tablename, colnames, coltypes)


        if agglevel == "detail":
            colfuncs["industry"] = get_fd_industry_code

        fdtable.parse_to_sql(io_col_map, colfuncs, skip_finaldemand)
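
Note: every example on this page resolves cached source files through fileutils.getcache. The project's real implementation is not reproduced here, but the call sites imply a small helper that joins optional subdirectory components onto a cache root. A minimal sketch (CACHE_ROOT and the exact join behavior are assumptions, not the project's actual code):

import os

CACHE_ROOT = os.path.expanduser("~/.eea-cache")  # illustrative location

def getcache(filename, *subdirs):
    # call sites pass zero or more subdirectory components, e.g.
    # getcache("l00_21.xls", "jp", "1990") or getcache("exr_wiod.xls", "wiod");
    # some pass a pre-joined relative path such as "tw/2010"
    return os.path.join(CACHE_ROOT, *subdirs, filename)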
Example #2
def parse_tables():
    files = fileutils.getcachecontents("io-annual")
    for filename in files:
        path = fileutils.getcache(filename, "io-annual")
        print(path)
        table = CSVTable(path, False)

        make_year = is_make(filename)
        use_year = is_use(filename)
        if make_year:
            table.create_sql_table(
                "%s.annual_make_%s" % (config.IO_SCHEMA, make_year),
                ["industry", "commodity", "value"],
                ["varchar(6)", "varchar(6)", "float"])

        elif use_year:
            table.create_sql_table(
                "%s.annual_use_%s" % (config.IO_SCHEMA, use_year),
                ["commodity", "industry", "value"],
                ["varchar(6)", "varchar(6)", "float"])

        elif filename == "codes.csv":
            table.create_sql_table(
                "%s.annual_codes" % config.IO_SCHEMA,
                ["code", "description"],
                ["varchar(6)", "text"])

        else:
            continue

        table.parse_to_sql()
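
Note: is_make and is_use are not shown in this example; judging from their use, each returns the year embedded in a make/use filename (it is interpolated into the table name) or a falsy value. A hypothetical sketch, assuming filenames like make1997.csv and use1997.csv (the real naming scheme may differ):

import re

def is_make(filename):
    # return the year string from a make-table filename, else None
    m = re.match(r"make(\d{4})\.csv$", filename)
    return m.group(1) if m else None

def is_use(filename):
    # same for use-table filenames
    m = re.match(r"use(\d{4})\.csv$", filename)
    return m.group(1) if m else None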
Example #3
File: parser.py Project: sonya/eea
def parse_io():
    # use 中分類 (medium aggregation) for all io tables, except that
    # the 中分類 for 1990 and 1995 don't break down the electronic
    # sectors as far as i would like, so use 小分類 (fine aggregation) there
    files = {
        1990: "l00_21.xls",
        1995: "l00_21.xls",
        2000: "io00a301.xls",
        2005: "io05a301.xls",
    }

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        # 1995 and 2000 io tables: easiest
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        # for 1995 use the heisei 2-7-12 file since it has more
        # harmonized sectors than the standalone 1995 file
        if year == 1995:
            sheetindex = 2
        else:
            # the first page of the heisei 2-7-12 file (used for 1990)
            # happens to be 1990 at nominal prices, matching the others
            sheetindex = 0

        path = fileutils.getcache(filename, "jp", str(year))
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(sheetindex)
        ind_names = None
        ind_codes = None
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if ind_codes is None:
                for cell in row:
                    if cell == 1:
                        ind_codes = [str(c).strip().rjust(3, "0") for c in row]
                        break
                    if cell.strip() == "001":
                        ind_codes = row
                        break
            elif ind_names is None:
                ind_names = row
                temp_codes = [None, None]
                for j in range(2, len(row)):
                    temp_codes.append(codes.set_code(ind_codes[j], row[j]))
                ind_codes = temp_codes
            else:
                from_code = row[0]
                if type(from_code) is float:
                    from_code = str(int(from_code)).rjust(3, "0")
                from_code = codes.set_code(from_code, row[1])
                if from_code:
                    for j in range(2, len(row)):
                        to_code = ind_codes[j]
                        value = row[j]
                        tables.insert_io(year, from_code, to_code, value)

        codes.update_codes()
Example #4
File: parser.py Project: sonya/eea
def parse_env():
    filename = "rftghgemissions.xls"
    path = fileutils.getcache(filename, "uk")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()

    tables = HybridTableCreator(config.SCHEMA)
    codes = tables.new_sector_codes(prefix="env_ind")

    codes.add_curated_codes({
            "Manufacture of petrochemicals": "20.1[467]+20.6",
            "Manufacture of other basic metals & casting (excl. Nuclear fuel & Aluminium)": "24.4[^26]-5",
            "Rest of repair; Installation": "33.1[^56]",
            })

    for sheet in sheets:
        series = sheet.name
        years = None
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if len(row) < 3 or type(row[2]) is str and not len(row[2]):
                continue
            if years is None:
                if type(row[2]) is float:
                    years = row
                    for year in row[2:]:
                        #envtable.add_env_table("env", year)
                        tables.add_env_table(year)
            else:
                code = codes.set_code(row[0], row[1])
                if code:
                    for j in range(2, len(row)):
                        tables.insert_env(years[j], code, series, row[j])

    codes.update_codes()
Example #5
File: eia_annual.py Project: sonya/eea
def parse_measurement(filename, measurement, tracker):
    filepath = fileutils.getcache(filename)
    with open(filepath) as f:
        csvf = csv.reader(f)
        header = next(csvf)
        for stryear in header[2:]:
            year = int(stryear)
            if year not in data:
                data[year] = {}
    
        for row in csvf:
            if len(row) == len(header):
                if row[0] == "US":
                    msn = row[1][:4]
                    for i in range(2, len(row)):
                        year = int(header[i])
                        value = row[i].strip()
                        if len(value):
                            if msn not in data[year]:
                                data[year][msn] = {measurement: value}
                            else:
                                data[year][msn][measurement] = value

                            source = msn[0:2]
                            sector = msn[2:4]
                            insert_values = [year, source, sector, float(value)]
                            if measurement == "price":
                                tracker.insert_row(pricetable, insert_values)
                            elif measurement == "use_btu":
                                tracker.insert_row(usetable, insert_values)
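
Note: the nested data[year][msn] bookkeeping above can be written more compactly with collections.defaultdict, which removes the membership test. An equivalent sketch (the values are illustrative):

from collections import defaultdict

# both levels spring into existence on first access:
# data[year] is a defaultdict and data[year][msn] a plain dict
data = defaultdict(lambda: defaultdict(dict))

data[2005]["TETC"]["price"] = "7.50"    # replaces the 'if msn not in data[year]' branch
data[2005]["TETC"]["use_btu"] = "98.0"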
Example #6
File: parser.py Project: sonya/eea
def parse_io():
    tables = HybridTableCreator(config.SCHEMA)

    codes = tables.new_sector_codes(prefix="ind")
    codes.add_curated_codes(config.curated_sectors)
    codes.blacklist_code("Differences between totals and sums of components are due to rounding")

    filename = "bb09-su-tables-1992-2003.xls"
    path = fileutils.getcache(filename, "uk")
    wb = xlrd.open_workbook(path)
    for year in range(1992, 2004):
        parse_ixi_year(tables, codes, wb, year)

    filename = "input-output-supply-and-use-tables--2004-2008.xls"
    path = fileutils.getcache(filename, "uk")
    wb = xlrd.open_workbook(path)
    for year in range(2004, 2009):
        parse_ixi_year(tables, codes, wb, year)

    codes.update_codes()
Example #7
File: parser.py Project: sonya/eea
def parse_env():
    tables = {}

    for year in config.STUDY_YEARS:
        tablename = "%s.env_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "measurement", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(31)", "float"]
        tables[year] = SQLTable(tablename, colnames, coltypes).create()
        tables[year].truncate()

    countries = sorted(config.countries.keys())
    countries.append("ROW")  # rest of world

    for (series, attribs) in config.env_series.items():
        if "dir" in attribs:
            subdir = attribs["dir"]
        else:
            subdir = series
        subdir = os.path.join("wiod", subdir)
        skip_name = "skip_name" in attribs and attribs["skip_name"]

        for country in config.countries.keys():
            filename = "%s_%s_May12.xls" % (country, series)
            print(filename)
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)

            for year in config.STUDY_YEARS:
                sheet = wb.sheet_by_name("%d" % year)
                measurements = sheet.row_values(0)
                if series == "EU":
                    measurements = [m + " - Gross" for m in measurements]
                elif series == "CO2":
                    measurements = ["CO2 - " + m for m in measurements]

                for i in range(1, sheet.nrows):
                    row = sheet.row_values(i)
                    if len(row[0].strip()):
                        if skip_name:
                            ind_code = row[0]
                            first_col = 1
                        else:
                            ind_name = row[0]
                            ind_code = row[1]
                            industry_tracker.set_code(ind_code, ind_name)
                            first_col = 2

                        for j in range(first_col, len(row)):
                            value = row[j]
                            if type(value) is float and value != 0:
                                measurement = measurements[j]
                                tables[year].insert(
                                    [country, ind_code, measurement, value])
Example #8
def doparse():
    country_dict = dict((v, k) for k, v in config.countries.items())
    country_dict["Slovakia"] = "SVK"

    sources = ["total", "nuclear", "thermal", "renewable",
               "geothermal", "solar", "wind", "biomass"]
    measurements = ["capacity", "consumption"]

    tablename = "%s.world_power" % ("eia")
    table = SQLTable(
        tablename,
        ["year", "country", "source", "units", "value"],
        ["int", "char(3)", "varchar(15)", "varchar(4)", "float"])
    table.create()
    table.truncate()

    for source in sources:
        for measure in measurements:
            if measure == "consumption":
                if source in ("geothermal", "solar", "wind", "biomass"):
                    continue

                units = "bkWh"
            elif measure == "capacity":
                units = "MkW"

            filename = source + "_" + measure + ".xls"
            path = fileutils.getcache(filename, "eia")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_index(0)
            header = None
            for i in range(sheet.nrows):
                row = sheet.row_values(i)
                if header is None:
                    if len(row) > 2 and type(row[2]) is float:
                        header = []
                        for cell in row:
                            if type(cell) is float:
                                header.append(int(cell))
                            else:
                                header.append(None)
                        header_len = len(header)
                elif len(row) > 2:
                    country_name = row[0]
                    if country_name in country_dict:
                        country = country_dict[country_name]
                        for j in range(2, header_len):
                            value = row[j]
                            year = header[j]
                            if type(value) is float and value > 0:
                                table.insert(
                                    [year, country, source, units, value])
Example #9
File: dbsetup.py Project: sonya/eea
    def create_simple_make_use(self, year, filename, factor=1):
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) == 4:
                    input_ind = cols[0]    # comm consumed (producing ind)
                    output_ind = cols[1]   # consuming ind (comm produced) 
                    use_dollars = cols[2]  # use in producers' prices
                    make_dollars = cols[3] # make in producers' prices

                    self.insert_make(input_ind, output_ind, make_dollars, factor)
                    self.insert_use(commod=input_ind, indus=output_ind,
                                    useval=use_dollars, factor=factor)
Example #10
File: parser.py Project: sonya/eea
def parse_codes():
    ## manually curated sector map
    table = SQLTable("%s.sector_map" % config.WIOD_SCHEMA,
                     ["io_code", "env_code", "description"],
                     ["varchar(15)", "varchar(15)", "text"]).create()
    table.truncate()

    sector_map = fileutils.getdatapath("sector_map.csv", "wiod")
    fh = open(sector_map, "r")
    csvf = csv.reader(fh)
    header = next(csvf)
    for row in csvf:
        io_code = row[0].strip()
        if not len(io_code):
            io_code = None
        env_code = row[1].strip()
        if not len(env_code):
            env_code = None
        desc = row[2].strip()
        table.insert([io_code, env_code, desc])

    ## current exchange rates
    table = SQLTable("%s.exchange_rates" % config.WIOD_SCHEMA,
                     ["country", "year", "rate"],
                     ["char(3)", "int", "float"]).create()
    table.truncate()

    path = fileutils.getcache("exr_wiod.xls", "wiod")
    wb = xlrd.open_workbook(path)
    sheet = wb.sheet_by_name("EXR")
    year_list = None
    for i in range(sheet.nrows):
        row = sheet.row_values(i)
        if len(row) < 2:
            continue
        if year_list is None:
            if type(row[0]) is str and row[0].strip() == "Country":
                year_list = [int(cell.strip("_ ")) for cell in row[2:]]
        else:
            if type(row[1]) is str and len(row[1].strip()) == 3:
                country = row[1]
                if country == "GER":
                    country = "DEU"
                for (year, value) in zip(year_list, row[2:]):
                    table.insert([country, year, value])
Example #11
File: parser.py Project: sonya/eea
def parse_env():
    for (tablecode, tablespec) in eea_tables.items():
        filename = "%s-eng.csv" % tablecode
        filepath = fileutils.getcache(filename, "ca")
        csvtable = CSVTable(filepath, True)
    
        tablename = "%s.%s" % (config.SCHEMA, tablespec["tablename"])
        csvtable.create_sql_table(tablename,
                                  ["year", "industry", "value"],
                                  ["int", "varchar(255)", "float"])

        col_funcs = {"industry": get_industry_code}
        col_map = tablespec["col_map"]
        skip_callback = None
        if "skip_callback" in tablespec:
            skip_callback = tablespec["skip_callback"]
    
        csvtable.parse_to_sql(col_map, col_funcs, skip_callback)
Example #12
File: parser.py Project: sonya/eea
    def parse_sut(sheet_name, table_prefix):
        tables = {}
        colnames = ["country", "commodity", "industry", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        for year in config.STUDY_YEARS:
            tablename = "%s_%d" % (table_prefix, year)
            tables[year] = SQLTable(tablename, colnames, coltypes).create()
            tables[year].truncate()

        for country in config.countries.keys():
            # TODO: more automated way to get this
            if country in ("AUS", "DEU", "GBR", "USA"):
                filename = "%s_SUT_Feb12.xls" % country
            else:
                filename = "%s_SUT_Jan12.xls" % country
            subdir = os.path.join("wiod", "suts")
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)

            # extract supply and use tables at fob prices
            sheet = wb.sheet_by_name(sheet_name)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                if not len(row[0].strip()):
                    continue
                year = int(row[0])
                if year not in config.STUDY_YEARS:
                    continue
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        tables[year].insert(
                            [country, com_code, ind_code, value])
Example #13
File: dbsetup.py Project: sonya/eea
    def create_simple_transaction_table(self, year, filename, factor=1):
        print("creating transations table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename, ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print("%d rows inserted" % insert_count)
Example #14
File: parser.py Project: sonya/eea
def parse_io():
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
    }

    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)

        # the "millions" column is in millions of NTD
        table = SQLTable(tablename, ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        to_codes = sheet.row_values(0)
        to_names = sheet.row_values(1)
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_code = row[0].strip()
            from_name = row[1].strip()
            for i in range(2, len(to_names)):
                to_name = to_names[i].strip()
                value = row[i]
                table.insert([from_name, to_name, value])

        if year == 2010:
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist": sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                sqlhelper.set_repr(config.from_blacklists[year]),
            }

            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
                SELECT from_map.io_sector AS from_sector,
                       to_map.io_sector as to_sector,
                       sum(millions) as millions
                  FROM %(tablename)s io,
                       (SELECT DISTINCT io_sector, io_commod
                          FROM %(maptable)s) from_map,
                       (SELECT DISTINCT io_sector, io_ind
                          FROM %(maptable)s) to_map
                 WHERE io.to_sector NOT IN %(to_blacklist)s
                   AND io.from_sector NOT IN %(from_blacklist)s
                   AND from_map.io_commod = io.from_sector
                   AND to_map.io_ind = io.to_sector
                 GROUP BY from_map.io_sector, to_map.io_sector""" % strings

            print(sql)
            db.execute(sql)
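
Note: the view above uses implicit comma joins; an equivalent formulation with explicit JOIN syntax (purely cosmetic; the literal names below stand in for the interpolated %(viewname)s, %(tablename)s, %(maptable)s and blacklist values):

CREATE OR REPLACE VIEW io_view_2010 AS
    SELECT from_map.io_sector AS from_sector,
           to_map.io_sector AS to_sector,
           sum(millions) AS millions
      FROM io_2010 io
      JOIN (SELECT DISTINCT io_sector, io_commod
              FROM sector_map_2010) from_map
        ON from_map.io_commod = io.from_sector
      JOIN (SELECT DISTINCT io_sector, io_ind
              FROM sector_map_2010) to_map
        ON to_map.io_ind = io.to_sector
     WHERE io.to_sector NOT IN ('...')       -- blacklists elided
       AND io.from_sector NOT IN ('...')
     GROUP BY from_map.io_sector, to_map.io_sector;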
Example #15
File: parser.py Project: sonya/eea
def parse_env():
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue
    
        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()
    
        def insert_row(rowdata, columns, max_sector_column):
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
    
            for (pollutant, amount) in zip(columns[max_sector_column+1:],
                                           rowdata[max_sector_column+1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])
    
        xact = db.xact(mode="READ WRITE")
        xact.begin()
    
        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb") # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html, "html.parser")
    
            print(adir, filename)
            title = soup.title.string
    
            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break
    
            columns = None
            did_have_numbers = False # true once we've seen a row of data values
            max_sector_column = 0 # 1 if english names are in a separate column, 0 otherwise
    
            prev_rowdata = None
            prev_rowspans = None
            data = []
    
            # long cell values are often expanded into the cell directly
            # below (multiple rows), resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates keeping state using heuristics.
            insert_later = None
            insert_now = None
    
            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue
    
                rowspans = []
                rowdata = []
    
                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None
    
                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
    
                for cellpos in range(rowlen):
                    cell = cells[cellpos]
    
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])
    
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')
    
                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]
    
                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                    abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)
    
                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)
    
                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan: # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)
    
                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()
    
                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans
    
                if len(rowdata) == 0:
                    continue
    
                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue
    
                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0: # all blank strings
                    continue
    
                # if we're sure we have columns, reset rowspans so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)
    
                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break
    
                if has_numbers or insert_later is None:
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "
    
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
    
                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])
    
                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    columns = insert_now
                    insert_now = None
    
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
    
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
    
                elif insert_now is not None and len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)
    
            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)
    
            print(columns)
    
        xact.commit()
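
Note: much of the state in the example above (prev_rowdata, prev_rowspans, the overflow heuristics) exists to cope with rowspan-induced blank cells. When the spans are well formed, it is simpler to expand the table into a rectangular grid first. A sketch with bs4 (colspan handling omitted for brevity; this would not tolerate the buggy spans the comments above describe):

from bs4 import BeautifulSoup

def table_to_grid(table):
    # expand rowspans so every row has a value in every column
    grid = []
    pending = {}  # column index -> [rows_remaining, text]
    for tr in table.find_all("tr"):
        cells = iter(tr.find_all(["td", "th"]))
        row = []
        col = 0
        while True:
            if col in pending:
                # a cell from an earlier row spans into this one
                pending[col][0] -= 1
                row.append(pending[col][1])
                if pending[col][0] == 0:
                    del pending[col]
            else:
                cell = next(cells, None)
                if cell is None:
                    if any(c > col for c in pending):
                        row.append("")  # hole before a later spanned column
                    else:
                        break
                else:
                    text = cell.get_text(strip=True)
                    row.append(text)
                    span = int(cell.get("rowspan", 1))
                    if span > 1:
                        pending[col] = [span - 1, text]
            col += 1
        grid.append(row)
    return grid

soup = BeautifulSoup("<table><tr><td rowspan='2'>a</td><td>b</td></tr>"
                     "<tr><td>c</td></tr></table>", "html.parser")
print(table_to_grid(soup.table))  # [['a', 'b'], ['a', 'c']]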
Example #16
File: dbsetup.py Project: sonya/eea
    def get_filename(self):
        filepath = os.path.join(str(self.year), self.filename)
        return fileutils.getcache(filepath)
Example #17
File: total_energy.py Project: sonya/eea
    def get_overlay_data_location(self):
        dataname = "%s-overlay.dat" % self.filename
        return fileutils.getcache(dataname, "gnuplot")
Example #18
File: parser.py Project: sonya/eea
def parse_int():
    for year in config.STUDY_YEARS:
        tablename = "%s.int_use_%d" % (config.WIOD_SCHEMA, year)
        colnames = [
            "from_country", "to_country", "commodity", "industry", "value"
        ]
        coltypes = [
            "char(3)", "char(3)", "varchar(15)", "varchar(15)", "float"
        ]
        use_table = SQLTable(tablename, colnames, coltypes).create()

        tablename = "%s.int_make_%d" % (config.WIOD_SCHEMA, year)
        colnames = ["country", "industry", "commodity", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        make_table = SQLTable(tablename, colnames, coltypes).create()

        filename = "IntSUT%s_row_Apr12.xls" % str(year)[2:4]
        subdir = os.path.join("wiod", "intsuts_analytic")
        path = fileutils.getcache(filename, subdir)
        wb = xlrd.open_workbook(path)

        for country in config.countries.keys():
            sheet = wb.sheet_by_name("USE_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)

                # notes say Use tables are broken down by origin
                from_country = row[1]

                # stupid hack so i don't have to change char(3)
                if from_country == "ZROW":
                    from_country = "RoW"

                com_code = commodity_tracker.set_code(row[2], row[3])
                if not com_code:
                    continue
                for j in range(4, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        use_table.insert(
                            [from_country, country, com_code, ind_code, value])

            sheet = wb.sheet_by_name("SUP_%s" % country)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # industry first
                        make_table.insert([country, ind_code, com_code, value])
Example #19
File: parser.py Project: sonya/eea
def parse_io():
    ### for ind x ind tables
    tables = {}
    colnames = ["country", "from_ind", "to_ind", "is_import", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "bool", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s.niot_%d" % (config.WIOD_SCHEMA, year)
        tables[year] = SQLTable(tablename, colnames, coltypes)  #.create()
        tables[year].drop()
        tables[year].create()
        tables[year].truncate()

    va_sectors = set(config.va_sectors.values())

    for country in config.countries.keys():
        filename = "%s_NIOT_ROW_Apr12.xlsx" % country
        subdir = os.path.join("wiod", "niot")
        path = fileutils.getcache(filename, subdir)
        wb = openpyxl.load_workbook(filename=path, use_iterators=True)
        for year in config.STUDY_YEARS:
            imports = {}

            sheet = wb.get_sheet_by_name("%d" % year)
            rows = sheet.iter_rows()
            industry_row = None
            for row in rows:
                cell = row[0]
                if cell.internal_value == "(industry-by-industry)":
                    industry_row = row
                    break
            row = next(rows)  # industry names
            industry_codes = []
            for (code_cell, desc_cell) in zip(industry_row, row):
                code = code_cell.internal_value
                desc = desc_cell.internal_value
                industry_codes.append(industry_tracker.set_code(code, desc))

            for row in rows:
                from_code = None
                from_desc = None
                is_import = False
                for (to_code, value_cell) in zip(industry_codes, row):
                    column = value_cell.column
                    value = value_cell.internal_value
                    # excel columns use letters
                    if column == "A":
                        from_code = value_cell.internal_value
                    elif column == "B":
                        from_desc = value_cell.internal_value
                    elif column == "C":
                        from_code = industry_tracker.set_code(
                            from_code, from_desc)
                        if not from_code:
                            break
                        if type(value) is str and value == "Imports":
                            is_import = True
                    elif (column > "D" or len(column) > 1) \
                            and to_code and value != 0:
                        tables[year].insert(
                            [country, from_code, to_code, is_import, value])

    ### for supply and use tables
    def parse_sut(sheet_name, table_prefix):
        tables = {}
        colnames = ["country", "commodity", "industry", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        for year in config.STUDY_YEARS:
            tablename = "%s_%d" % (table_prefix, year)
            tables[year] = SQLTable(tablename, colnames, coltypes).create()
            tables[year].truncate()

        for country in config.countries.keys():
            # TODO: more automated way to get this
            if country in ("AUS", "DEU", "GBR", "USA"):
                filename = "%s_SUT_Feb12.xls" % country
            else:
                filename = "%s_SUT_Jan12.xls" % country
            subdir = os.path.join("wiod", "suts")
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)

            # extract supply and use tables at fob prices
            sheet = wb.sheet_by_name(sheet_name)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))

            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                if not len(row[0].strip()):
                    continue
                year = int(row[0])
                if year not in config.STUDY_YEARS:
                    continue
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        tables[year].insert(
                            [country, com_code, ind_code, value])

    # make tables
    parse_sut("SUP_bas", "%s.make" % config.WIOD_SCHEMA)

    # use tables
    parse_sut("USE_bas", "%s.use" % config.WIOD_SCHEMA)
Example #20
def doparse():

    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename, ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    country_fips = {
        "LU": "LUX",
        "US": "USA",
        "NL": "NLD",
        "AU": "AUT",
        "SW": "SWE",
        "CA": "CAN",
        "AS": "AUS",
        "EI": "IRL",
        "GM": "DEU",
        "BE": "BEL",
        "TW": "TWN",
        "DA": "DNK",
        "UK": "GBR",
        "FR": "FRA",
        "JA": "JPN",
        "KS": "KOR",
        "SP": "ESP",
        "CY": "CYP",
        "SI": "SVN",
        "EZ": "CZE",
        "GR": "GRC",
        "MT": "MLT",
        "PO": "PRT",
        "LO": "SVK",
        "PL": "POL",
        "EN": "EST",
        "HU": "HUN",
        "LH": "LTU",
        "LG": "LVA",
        "MX": "MEX",
        "TU": "TUR",
        "BR": "BRA",
        "RO": "ROU",
        "BU": "BGR",
        "CH": "CHN",
        "ID": "IDN",
        "IN": "IND",
        "RS": "RUS",
        "FI": "FIN",
        "IT": "ITA",
    }

    # this file spec is documented in the xlsx file from the archive
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
    }

    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    imf_fields = (
        "LP",  # population
        "PPPPC",  # ppp per capita
        "NGDPRPC",  # gdp per capita in constant prices
        "NGDP_D",  # gdp deflator
    )

    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")

    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}

        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")

        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i

        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value):  #and year < last_year:
                        table.insert([
                            year, country, field,
                            float(value.replace(",", ""))
                        ])
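
Note: since the WEO file's extension (.xls) is misleading, csv.Sniffer can confirm the delimiter before committing to a tab dialect. A small sketch (path as computed in the example above):

import codecs
import csv

with codecs.open(path, "r", "cp1252") as fh:
    dialect = csv.Sniffer().sniff(fh.read(4096), delimiters="\t,|")
    fh.seek(0)
    csvf = csv.reader(fh, dialect)
    print(next(csvf)[:3])  # peek at the first few header columns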
Example #21
File: pcebridge.py Project: sonya/eea
 def set_filename(self, filename):
     path = fileutils.getcache(filename, str(self.year))
     self.filename = path
Example #22
File: pcebridge.py Project: sonya/eea
def parse_nipa_data():
    test_view = "%s.nipa_groups" % common.config.TEST_SCHEMA
    db.execute("DROP VIEW IF EXISTS %s" % test_view)

    # get table for pce category harmonization
    trailing_pat = re.compile(r'(.+) \(.*\d.*\)$')
    
    nipa_code_map = {}
    filename = fileutils.getdatapath("nipa_code_map.csv", "usa")
    fh = open(filename)
    csvf = csv.reader(fh)
    for row in csvf:
        if len(row) == 2:
            harmonized = row[0]
            trailing = trailing_pat.match(harmonized)
            if trailing:
                harmonized = trailing.group(1)
            nipa_code_map[row[1]] = harmonized
    fh.close()
    
    # get nipa series codes from underlying detail tables
    tracker = TableStateTracker()
    tracker.create_table("%s.pce_codes" % config.NIPA_SCHEMA,
                         ["code", "parent", "description"],
                         ["char(7)", "char(7)", "text"],
                         True)

    number_pat = re.compile(r'^\d+$')
    trailing_pat = re.compile(r'(.+) \(.*\d.*\)$')
    
    filename = fileutils.getcache("Section2All_underlying.csv", "bea", "nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    
    code_stack = [None]
    indent_stack = [-1]
    
    # the code mapping has been done such that each item is at least at
    # three levels of disaggregation below the top, i.e. there is always
    # an ancestor at the second level. we only want to keep track of the
    # ancestor at the third level (root is zero)
    # the first level below root has goods and services
    # the second level has durable goods, nondurable goods, and services.
    reverse_code_dict = {}
    second_level_nodes = []
    
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 2.4.5U"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 2.4.5U"):
                    # we only need to go through one instance of this table
                    break
                else:
                    if number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
    
                        # these are duplicate codes
                        if title.startswith("Market-based PCE"):
                            continue
    
                        code = row[2]
                        current_indent = len(row[1]) - len(title)
    
                        while current_indent <= indent_stack[-1]:
                            indent_stack.pop()
                            code_stack.pop()
    
                        indent_stack.append(current_indent)
                        code_stack.append(code)
    
                        if len(code_stack) > 1:
                            parent = code_stack[-2]
                        else:
                            parent = None
    
                        title = title.strip()
                        trailing = trailing_pat.match(title)
                        if trailing:
                            title = trailing.group(1)
                        
                        if len(code_stack) > 4:
                            reverse_code_dict[title] = code_stack[3]
                        else:
                            reverse_code_dict[title] = code
    
                        tracker.insert_row((code, parent, title))
    
    tracker.flush()
    fh.close()
    
    # table for price deflators
    
    tracker.create_table("%s.implicit_price_deflators" % config.NIPA_SCHEMA,
                         ["year", "gdp", "pce"],
                         ["int", "float", "float"])
    
    filename = fileutils.getcache("Section1all_csv.csv", "bea/nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False
    
    data = {} # we need to parse two rows before we can populate
    years = {}
    
    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 1.1.9"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 1.1.9"):
                    # this is the seasonally adjusted version of the same table
                    break
                else:
                    if row[0] == "Line":
                        for i in range(len(row)):
                            if number_pat.match(row[i]):
                                year = int(row[i])
                                years[year] = i
                                data[year] = {}
    
                    elif number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        if title == "Gross domestic product":
                            column = "gdp"
                        elif title == "Personal consumption expenditures":
                            column = "pce"
                        else:
                            continue
    
                        for (year, colindex) in years.items():
                            data[year][column] = float(row[colindex])
    
    for (year, results) in data.items():
        tracker.insert_row([year, results["gdp"], results["pce"]])
    
    tracker.flush()
    fh.close()
    
    # parse pce bridge
    
    class IONIPAStateTracker(TableStateTracker):
    
        def flush(self):
            TableStateTracker.flush(self)
            if self.fh is not None and not self.fh.closed:
                self.fh.close()
    
        def __init__(self):
            TableStateTracker.__init__(self)
            self.fh = None
            self.code_dict = None
    
            self.value_columns = [
                "prod_val",
                "rail_margin",
                "truck_margin",
                "water_margin",
                "air_margin",
                "pipe_margin",
                "gaspipe_margin",
                "wholesale_margin",
                "retail_margin",
                "purchase_val"
                ]
    
            self.old_style_field_map = {
                "Producers' Value": "prod_val",
                "MfgExciseTax": "prod_val",
                "RailMargin": "rail_margin",
                "TruckMargin": "truck_margin",
                "WaterMargin": "water_margin",
                "AirMargin": "air_margin",
                "PipeMargin": "pipe_margin",
                "WholesaleMargin": "wholesale_margin",
                "WholesaleTax": "wholesale_margin",
                "RetailMargin": "retail_margin",
                "RetailSalesTax": "retail_margin",
                "OtherRetailTax": "retail_margin",
                "Purchasers' Value": "purchase_val",
                }
    
        def set_filename(self, filename):
            path = fileutils.getcache(filename, str(self.year))
            self.filename = path
    
        def set_year(self, year):
            self.flush()
            self.year = year
            tablename = "%s.pcebridge_%d" % (config.IO_SCHEMA, year)
            fields = ["pce_code", "commodity"] + self.value_columns
            types = ["varchar(6)", "varchar(6)"] + \
                ["bigint"]*len(self.value_columns)
            self.create_table(tablename, fields, types)
    
        def setup_for_codes(self):
            self.code_dict = {}
    
        def flush_codes(self):
            if self.code_dict is not None:
                tablename = "%s.nipa_codes_%d" % (config.IO_SCHEMA, self.year)
                self.create_table(tablename,
                                  ["pce_code", "nipa_group", "description"],
                                  ["varchar(6)", "char(7)", "text"])
                for (code, raw_desc) in self.code_dict.items():
    
                    desc = raw_desc
                    if desc.endswith('(s.)') or desc.endswith('(d.)'):
                        desc = desc[:-4].strip()
                    elif desc.endswith('(n.d.)'):
                        desc = desc[:-6].strip()
    
                    if desc in nipa_code_map:
                        desc = nipa_code_map[desc]
    
                    if desc in reverse_code_dict:
                        nipa_code = reverse_code_dict[desc]
                    else:
                        nipa_code = None
                    #self.current_stmt(code, nipa_code, raw_desc)
                    self.table.insert([code, nipa_code, raw_desc])
    
                self.code_dict = None
                self.flush()
    
        def insert_code_row(self, code, desc):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(code) is float:
                code = int(code)
    
            self.code_dict[str(code)] = desc.strip()
    
        def insert_row(self, pce_code, commod, dollar_values, factor=1):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(pce_code) is float:
                pce_code = int(pce_code)
    
            values = [str(pce_code).strip(), commod.strip()]
            for column in self.value_columns:
                if column in dollar_values:
                    if factor == 1:
                        values.append(dollar_values[column])
                    else:
                        values.append(int(float(dollar_values[column]) * factor))
                else:
                    values.append(None)
            #self.current_stmt(*values)
            self.table.insert(values)
    
        def parse_old_style_xls(self, year):
            self.set_year(year)
            self.set_filename("%d_PCE_Commodity.xls" % self.year)
            wb = xlrd.open_workbook(self.filename)
    
            # parse pce bridge data
            sheet = wb.sheet_by_name("%d PCE Workfile - Commodity" % self.year)
            field_indexes = {}
            pce_code_idx = 0
            commod_idx = 2
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    if "PCE Category" in row:
                        pce_code_idx = row.index("PCE Category")
                        if "Commodity" in row:
                            commod_idx = row.index("Commodity")
                        for i in range(len(row)):
                            xls_col = row[i]
                            if xls_col in self.old_style_field_map:
                                colname = self.old_style_field_map[xls_col]
                                if colname not in field_indexes:
                                    field_indexes[colname] = []
                                field_indexes[colname].append(i)
                    elif len(field_indexes):
                        pce_code = row[pce_code_idx]
                        commod = str(int(row[commod_idx])).rjust(6, "0")
                        values = {}
                        for (field, columns) in field_indexes.items():
                            # documentation says units are in 100,000 dollars
                            # but the orders of magnitude don't match up with
                            # later years if we use 100
                            components = [int(float(row[column]) * 1000)
                                          for column in columns]
                            value = 0
                            for component in components:
                                value += component
                            values[field] = value
                        self.insert_row(pce_code, commod, values)
    
            # parse codes from neighboring worksheet
            self.setup_for_codes()
            sheet = wb.sheet_by_name("%d PCE Category Descriptions" % self.year)
            code_idx = None
            desc_idx = None
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    codetab = "PCE Category Code"
                    codetab2 = "%s - %d" % (codetab, self.year)
                    if codetab in row or codetab2 in row:
                        if codetab in row:
                            code_idx = row.index(codetab)
                        else:
                            code_idx = row.index(codetab2)
                        desctab = "PCE Category Description - %d" % self.year
                        if desctab in row:
                            desc_idx = row.index(desctab)
                        else:
                            desctab = "PCE Category Description"
                            if desctab in row:
                                desc_idx = row.index(desctab)
                    elif code_idx is not None and desc_idx is not None:
                        code = row[code_idx]
                        desc = str(row[desc_idx])
                        self.insert_code_row(code, desc)
            self.flush_codes()
    
        def get_file_handle(self, filetype, options={}):
            if filetype == "txt":
                self.fh = open(self.filename)
                return self.fh
            elif filetype == "csv":
                self.fh = open(self.filename)
                if "delim" in options:
                    csvf = csv.reader(self.fh, delimiter=options["delim"])
                else:
                    csvf = csv.reader(self.fh)
                return csvf
            elif filetype == "xls":
                wb = xlrd.open_workbook(self.filename)
                return wb
    
        def parse_text(self, rowcallback):
            # self.filename already holds the resolved cache path
            # (set by set_filename above)
            f = open(self.filename)
            for line in f:
                rowcallback(line, self)
            f.close()
    
    tracker = IONIPAStateTracker()
    tracker.parse_old_style_xls(1967)
    tracker.parse_old_style_xls(1972)
    tracker.parse_old_style_xls(1977)
    tracker.parse_old_style_xls(1982)
    
    tracker.set_year(1987)
    tracker.set_filename("tbld-87.dat")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 103:
            continue
        commod = line[0:6]
        pce_code = line[14:18]
        values = {
            "prod_val": line[21:30],
            "rail_margin": line[30:39],
            "truck_margin": line[39:48],
            "water_margin": line[48:57],
            "air_margin": line[57:66],
            "pipe_margin": line[66:75],
            "wholesale_margin": line[75:84],
            "retail_margin": line[84:93],
            "purchase_val": line[93:102],
            }
        tracker.insert_row(pce_code, commod, values, 1000)
    
    tracker.setup_for_codes()
    tracker.set_filename("io-nipa.doc")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 27:
            continue
        code = line[0:4].strip()
        desc = line[26:].strip()
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()
    
    tracker.set_year(1992)
    tracker.set_filename("TabD.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        values = {
            "prod_val": row[4],
            "rail_margin": row[5],
            "truck_margin": row[6],
            "water_margin": row[7],
            "air_margin": row[8],
            "pipe_margin": row[9],
            "gaspipe_margin": row[10],
            "wholesale_margin": row[11],
            "retail_margin": row[12],
            "purchase_val": row[13],
            }
        tracker.insert_row(row[2], row[0], values, 1000)
    
    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        code = row[0]
        desc = row[4]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()
    
    tracker.set_year(1997)
    tracker.set_filename("AppendixC_Detail.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        values = {
            "prod_val": row[3],
            "rail_margin": row[4],
            "truck_margin": row[5],
            "water_margin": row[6],
            "air_margin": row[7],
            "pipe_margin": row[8],
            "gaspipe_margin": row[9],
            "wholesale_margin": row[10],
            "retail_margin": row[11],
            "purchase_val": row[12],
            }
        tracker.insert_row(row[1], row[0], values, 1000)
    
    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA_PCE.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        code = row[1]
        desc = row[2]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()
    
    tracker.set_year(2002)
    tracker.setup_for_codes() # do this simultaneously since it's all one file
    tracker.set_filename("2002_PCE_Bridge.xls")
    wb = tracker.get_file_handle("xls")
    naics_pat = re.compile(r'[A-Z0-9]{6}')
    sheet = wb.sheet_by_name("PCE_Bridge_Detail")
    pce_codes = []
    for rowindex in range(sheet.nrows):
        row = sheet.row_values(rowindex)
        if len(row) == 13 and naics_pat.match(row[1]):
            pce_desc = row[0]
            # we don't need the distinction between households and
            # nonprofit institutions serving households
            parts = pce_desc.split('-')
            if len(parts) > 1:
                lastpart = parts[-1].strip()
                if lastpart == 'HH' or lastpart == 'NPISH':
                    pce_desc = '-'.join(parts[:-1])
            pce_desc = pce_desc.strip()
    
            if pce_desc in pce_codes:
                pce_code = pce_codes.index(pce_desc)
            else:
                pce_code = len(pce_codes)
                pce_codes.append(pce_desc)
                tracker.insert_code_row(str(pce_code), pce_desc)
            
            values = {
                "prod_val": row[3],
                "rail_margin": row[4],
                "truck_margin": row[5],
                "water_margin": row[6],
                "air_margin": row[7],
                "pipe_margin": row[8],
                "gaspipe_margin": row[9],
                "wholesale_margin": row[10],
                "retail_margin": row[11],
                "purchase_val": row[12],
                }
            tracker.insert_row(str(pce_code), row[1], values, 1000)
    
    tracker.flush_codes()
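
The indentation stack above is the crux of recovering the NIPA hierarchy: each row's parent is whatever code remains on the stack after popping everything at the same or deeper indent. A standalone sketch with hypothetical titles, codes, and indent widths:

rows = [  # (title, code, indent) -- illustrative, not real NIPA rows
    ("Personal consumption expenditures", "A", 0),
    ("Goods", "B", 4),
    ("Durable goods", "C", 8),
    ("Motor vehicles and parts", "D", 12),
    ("Services", "E", 4),
]

code_stack = [None]
indent_stack = [-1]
for (title, code, indent) in rows:
    while indent <= indent_stack[-1]:  # pop siblings and deeper levels
        indent_stack.pop()
        code_stack.pop()
    parent = code_stack[-1]
    indent_stack.append(indent)
    code_stack.append(code)
    print(title, "-> parent:", parent)
# "Services" pops D, C and B back off the stack, so its parent is "A" again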
Example #32
File: parser.py Project: sonya/eea
def parse_env():

    files = {
        # 2005 only has 細分類 (the fine-level classification), while
        1990: "ei90187p.xls",
        1995: "ei95186p.xls",
        2000: "ei2000p104v01j.xls",
        2005: "ei2005pc403jp_wt_bd.xlsx",
    }

    def series_names_from_rows(names, units):
        # since these tables are structured identically
        # we'll just do some hard coding
        series_names = []
        for i in range(3, len(names)):
            if len(names[i]):
                name = "%s (%s)" % (names[i], units[i])
            else:
                name = None
            series_names.append(name)
        return series_names

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_env_table(year, series_max_length=255)
        codes = tables.new_sector_codes(year, "env_ind")
        codes.curate_code_from_desc("総合計", "total")
        codes.blacklist_code("total")

        path = fileutils.getcache(filename, "jp", str(year))
        if filename.endswith("xls"):
            wb = xlrd.open_workbook(path)
            # each xls file starts with ToC listing tables A-E.
            # E1: direct energy consumption and energy intensity by sector
            # E2: direct CO2 emissions and CO2 emission intensity by sector
            for sheetname in ("E1", "E2"):
                sheet = wb.sheet_by_name(sheetname)
                min_series_col = 4  # first col whose values interest us
                if sheetname == "E1":
                    min_series_col = 3  # GDP - only want this once

                series_names = series_names_from_rows(sheet.row_values(0),
                                                      sheet.row_values(1))

                for i in range(2, sheet.nrows):
                    row = sheet.row_values(i)
                    code = row[1]
                    if type(code) is float:
                        code = str(int(code)).rjust(3, "0")
                    code = codes.set_code(code, row[2])
                    if code:
                        for (series, value) in zip(series_names, row[3:]):
                            if type(value) is float:
                                tables.insert_env(year, code, series, value)

        elif filename.endswith("xlsx"):
            wb = openpyxl.load_workbook(filename=path, use_iterators=True)
            # E: direct energy consumption and GHG emissions by sector,
            #    plus energy and GHG intensities
            sheet = wb.get_sheet_by_name("E")
            rows = sheet.iter_rows()
            series_names = series_names_from_rows(
                [cell.internal_value for cell in next(rows)],
                [cell.internal_value for cell in next(rows)])
            for row in rows:
                code = codes.set_code(row[1].internal_value,
                                      row[2].internal_value)
                if code:
                    for (series, cell) in zip(series_names, row[3:]):
                        if cell.internal_value is not None:
                            tables.insert_env(year, code, series,
                                              cell.internal_value)

        codes.update_codes()
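
parse_env juggles two spreadsheet libraries: xlrd for the legacy .xls years and openpyxl for the 2005 .xlsx. A sketch of hiding that split behind a single row iterator (the helper name is mine, and it assumes the same old openpyxl iterator API used above):

import xlrd
import openpyxl

def iter_sheet_rows(path, sheetname):
    # yield each row as a list of plain cell values, whichever library applies
    if path.endswith(".xlsx"):
        wb = openpyxl.load_workbook(filename=path, use_iterators=True)
        sheet = wb.get_sheet_by_name(sheetname)
        for row in sheet.iter_rows():
            yield [cell.internal_value for cell in row]
    else:
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_name(sheetname)
        for i in range(sheet.nrows):
            yield sheet.row_values(i)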
Example #33
File: pcebridge.py Project: sonya/eea
 def parse_text(self, rowcallback):
     # self.filename already holds the resolved cache path (see set_filename)
     f = open(self.filename)
     for line in f:
         rowcallback(line, self)
     f.close()
Example #34
File: food_sector.py Project: sonya/eea
import csv

from usa import bea, config, eia, common, wiod_code_map
from common.dbconnect import db
from common import fileutils, utils, sqlhelper

old_meat_codes = ["140101", "140102", "140103", "140105"]
new_meat_codes = ["311611", "311612", "311615", "31161A"]

combined_meat_codes = old_meat_codes + new_meat_codes

for year in config.STUDY_YEARS:
    print(year)

    path = fileutils.getcache("fossil_fuel_estimates_%d.csv" % year, "usa")
    fh = open(path, "r")
    csvf = csv.reader(fh)
    io_codes = common.io_codes_for_year(year)

    data = {}
    row = next(csvf)
    for row in csvf:
        if len(row) == 6:
            sector = row[0]
            btu = row[1]  # total
            #btu = row[2] # coal
            #btu = row[3] # natural gas
            #btu = row[5] # PA-nontrans
            data[sector] = float(btu)
Example #35
def doparse():
    tracker = IOTableStateTracker()
    
    #tracker.create_simple_transaction_table(
    #    "1947", "1947/1947 Transactions 85-level Data.txt")
    #tracker.create_simple_transaction_table(
    #    "1958", "1958/1958 Transactions 85-level Data.txt")
    #tracker.create_simple_transaction_table(
    #    "1963", "1963/1963 Transactions 367-level Data.txt")
    #tracker.create_simple_transaction_table(
    #    "1967", "1967/1967 Transactions 484-level Data.txt", 1000)
    
    tracker.create_simple_make_use(
        "1972", "1972/1972 Transactions 496-level Data.txt", 1000)
    tracker.create_simple_make_use(
        "1977", "1977/1977 Transactions 537-level Data.txt", 1000)

    tracker.create_make_table("1982")
    tracker.create_use_table("1982", True)
    with open(fileutils.getcache("82-6DT.DAT", "1982"), "r") as f:
        for line in f:
            if len(line) >= 112: # right-aligned
                input_ind = line[0:6]
                output_ind = line[6:12]
                use_dollars = line[12:22]
                make_dollars = line[22:32]
                tracker.insert_make(input_ind, output_ind, make_dollars, 100)
                tracker.insert_use(input_ind, output_ind, use_dollars,
                                   {"margins": line[32:42],
                                    "rail_margin": line[42:52],
                                    "truck_margin": line[52:62],
                                    "water_margin": line[62:72],
                                    "air_margin": line[72:82],
                                    "pipe_margin": line[82:92],
                                    "wholesale_margin": line[92:102],
                                    "retail_margin": line[102:112]},
                                    100) # this year dollars are in 100,000s
    
    tracker.create_make_table("1987")
    with open(fileutils.getcache("TBL1-87.DAT", "1987"), "r") as f:
        for line in f:
            if len(line) >= 24: # right-aligned
                tracker.insert_make(
                    line[0:6], line[7:13], line[15:24], 1000)
    
    tracker.create_use_table("1987", True)
    with open(fileutils.getcache("TBL2-87.DAT", "1987"), "r") as f:
        for line in f:
            if len(line) >= 96: # right-aligned
                input_ind = line[0:6]
                output_ind = line[7:13]
                use_dollars = line[15:24].strip()
                tracker.insert_use(
                    input_ind, output_ind, use_dollars,
                    {"margins": line[24:33],
                     "rail_margin": line[33:42],
                     "truck_margin": line[42:51],
                     "water_margin": line[51:60],
                     "air_margin": line[60:69],
                     "pipe_margin": line[69:78],
                     "wholesale_margin": line[78:87],
                     "retail_margin": line[87:96]},
                    1000)
    
    # the documentation for 1992 appears very incorrect unless there
    # is some way for tabs to be 7 characters for two fields and 9 
    # characters for the rest of the fields. we will just assume the
    # file is an ordinary tab-delimited file.
    
    tracker.create_make_table("1992")
    with open(fileutils.getcache("IOMAKE.TXT", "1992"), "r") as f:
        for line in f:
            row = line.split("\t")
            if len(row) == 4:
                tracker.insert_make(row[0], row[1], row[3], 1000)
    
    tracker.create_use_table("1992", True)
    with open(fileutils.getcache("IOUSE.TXT", "1992"), "r") as f:
        for line in f:
            row = line.split("\t")
            if len(row) == 13:
                tracker.insert_use(
                    row[0], row[1], row[3],
                    {"margins": row[4],
                     "rail_margin": row[5],
                     "truck_margin": row[6],
                     "water_margin": row[7],
                     "air_margin": row[8],
                     "pipe_margin": row[9],
                     "gaspipe_margin": row[10],
                     "wholesale_margin": row[11],
                     "retail_margin": line[12]},
                    1000)
    
    tracker.create_make_table("1997")
    with open(fileutils.getcache("NAICSMakeDetail.txt", "1997")) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 4:
                tracker.insert_make(row[0], row[1], row[3], 1000)    
    
    tracker.create_use_table("1997", True)
    with open(fileutils.getcache("NAICSUseDetail.txt", "1997")) as f:
        csvf = csv.reader(f)
        for row in csvf:
            if len(row) == 15:
                tracker.insert_use(
                    row[0], row[1], row[4],
                    {"margins": row[5],
                     "rail_margin": row[6],
                     "truck_margin": row[7],
                     "water_margin": row[8],
                     "air_margin": row[9],
                     "pipe_margin": row[10],
                     "gaspipe_margin": row[11],
                     "wholesale_margin": row[12],
                     "retail_margin": row[13]},
                    1000)
    
    # contrary to the format documentation, revised 2002 tables are
    # delimited with mixed tabs and spaces. they appear fixed width with
    # 8-char tabs. field names fortunately do not contain whitespace.
    valid_line = re.compile(r"[A-Z0-9]{6}\s")
    
    tracker.create_make_table("2002")
    with open(fileutils.getcache("REV_NAICSMakeDetail 4-24-08.txt", "2002")) as f:
        fields = dbsetup.get_header_locations(
                     dbsetup.replace_tabs(f.readline().strip()))
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(dbsetup.replace_tabs(line), fields)
                tracker.insert_make(
                    row["Industry"], row["Commodity"], row["ProVal"], 1000)
    
    tracker.create_use_table("2002", True)
    with open(fileutils.getcache("REV_NAICSUseDetail 4-24-08.txt", "2002")) as f:
        # cheat here because it's not worth the trouble to deal with
        # lack of whitespace between two fields (GasPipeVal and WhsVal)
        line = f.readline().strip().replace("GasPipeVal", "GasPipe   ")
        fields = dbsetup.get_header_locations(dbsetup.replace_tabs(line))
        for line in f:
            if valid_line.match(line):
                row = dbsetup.get_values_for_fields(
                    dbsetup.replace_tabs(line), fields)
                tracker.insert_use(
                    row["Commodity"], row["Industry"], row["ProVal"],
                    {"margins": row["StripMar"],
                     "rail_margin": row["RailVal"],
                     "truck_margin": row["TruckVal"],
                     "water_margin": row["WaterVal"],
                     "air_margin": row["AirVal"],
                     "pipe_margin": row["PipeVal"],
                     "gaspipe_margin": row["GasPipe"],
                     "wholesale_margin": row["WhsVal"],
                     "retail_margin": row["RetVal"]},
                    1000)
    
    tracker.flush()
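
Each vintage above hard-codes its column slices inline. A table-driven sketch of the same fixed-width parsing (the spans mirror the 1987 make table TBL1-87.DAT; the spec and helper names are illustrative):

MAKE_1987 = [("input_ind", 0, 6), ("output_ind", 7, 13), ("dollars", 15, 24)]

def parse_fixed_width(line, spec):
    # slice each (name, start, end) span out of one fixed-width record
    return {name: line[start:end].strip() for (name, start, end) in spec}

# rec = parse_fixed_width(line, MAKE_1987)
# tracker.insert_make(rec["input_ind"], rec["output_ind"], rec["dollars"], 1000)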
Example #36
 def get_overlay_data_location(self):
     dataname = "%s-overlay.dat" % self.filename
     return fileutils.getcache(dataname, "gnuplot")
Example #38
File: parser.py Project: sonya/eea
def parse_io():
    ### for ind x ind tables
    tables = {}
    colnames = ["country", "from_ind", "to_ind", "is_import", "value"]
    coltypes = ["char(3)", "varchar(15)", "varchar(15)", "bool", "float"]
    for year in config.STUDY_YEARS:
        tablename = "%s.niot_%d" % (config.WIOD_SCHEMA, year)
        tables[year] = SQLTable(tablename, colnames, coltypes)#.create()
        tables[year].drop()
        tables[year].create()
        tables[year].truncate()

    va_sectors = set(config.va_sectors.values())

    for country in config.countries.keys():
        filename = "%s_NIOT_ROW_Apr12.xlsx" % country
        subdir = os.path.join("wiod", "niot")
        path = fileutils.getcache(filename, subdir)
        wb = openpyxl.load_workbook(filename=path, use_iterators=True)
        for year in config.STUDY_YEARS:
            imports = {}

            sheet = wb.get_sheet_by_name("%d" % year)
            rows = sheet.iter_rows()
            industry_row = None
            for row in rows:
                cell = row[0]
                if cell.internal_value == "(industry-by-industry)":
                    industry_row = row
                    break
            row = next(rows) # industry names
            industry_codes = []
            for (code_cell, desc_cell) in zip(industry_row, row):
                code = code_cell.internal_value
                desc = desc_cell.internal_value
                industry_codes.append(industry_tracker.set_code(code, desc))

            for row in rows:
                from_code = None
                from_desc = None
                is_import = False
                for (to_code, value_cell) in zip(industry_codes, row):
                    column = value_cell.column
                    value = value_cell.internal_value
                    # excel columns use letters
                    if column == "A":
                        from_code = value_cell.internal_value
                    elif column == "B":
                        from_desc = value_cell.internal_value
                    elif column == "C":
                        from_code = industry_tracker.set_code(
                            from_code, from_desc)
                        if not from_code:
                            break
                        if type(value) is str and value == "Imports":
                            is_import = True
                    elif (column > "D" or len(column) > 1) \
                            and to_code and value != 0:
                        tables[year].insert(
                            [country, from_code, to_code, is_import, value])

    ### for supply and use tables
    def parse_sut(sheet_name, table_prefix):
        tables = {}
        colnames = ["country", "commodity", "industry", "value"]
        coltypes = ["char(3)", "varchar(15)", "varchar(15)", "float"]
        for year in config.STUDY_YEARS:
            tablename = "%s_%d" % (table_prefix, year)
            tables[year] = SQLTable(tablename, colnames, coltypes).create()
            tables[year].truncate()
    
        for country in config.countries.keys():
            # TODO: more automated way to get this
            if country in ("AUS", "DEU", "GBR", "USA"):
                filename = "%s_SUT_Feb12.xls" % country
            else:
                filename = "%s_SUT_Jan12.xls" % country
            subdir = os.path.join("wiod", "suts")
            path = fileutils.getcache(filename, subdir)
            wb = xlrd.open_workbook(path)
    
            # extract supply and use tables at fob prices
            sheet = wb.sheet_by_name(sheet_name)
            industry_row = sheet.row_values(0)
            row = sheet.row_values(1)
            industry_codes = []
            for (code, desc) in zip(industry_row, row):
                industry_codes.append(industry_tracker.set_code(code, desc))
    
            for i in range(2, sheet.nrows):
                row = sheet.row_values(i)
                if not len(row[0].strip()):
                    continue
                year = int(row[0])
                if year not in config.STUDY_YEARS:
                    continue
                com_code = commodity_tracker.set_code(row[1], row[2])
                if not com_code:
                    continue
                for j in range(3, len(row)):
                    value = row[j]
                    ind_code = industry_codes[j]
                    if value != 0 and ind_code:
                        # commodity first
                        tables[year].insert(
                            [country, com_code, ind_code, value])

    # make tables
    parse_sut("SUP_bas", "%s.make" % config.WIOD_SCHEMA)

    # use tables
    parse_sut("USE_bas", "%s.use" % config.WIOD_SCHEMA)
Example #39
File: pcebridge.py Project: sonya/eea
def parse_nipa_data():
    test_view = "%s.nipa_groups" % common.config.TEST_SCHEMA
    db.execute("DROP VIEW IF EXISTS %s" % test_view)

    # get table for pce category harmonization
    trailing_pat = re.compile(r'(.+) \(.*\d.*\)$')

    nipa_code_map = {}
    filename = fileutils.getdatapath("nipa_code_map.csv", "usa")
    fh = open(filename)
    csvf = csv.reader(fh)
    for row in csvf:
        if len(row) == 2:
            harmonized = row[0]
            trailing = trailing_pat.match(harmonized)
            if trailing:
                harmonized = trailing.group(1)
            nipa_code_map[row[1]] = harmonized
    fh.close()

    # get nipa series codes from underlying detail tables
    tracker = TableStateTracker()
    tracker.create_table("%s.pce_codes" % config.NIPA_SCHEMA,
                         ["code", "parent", "description"],
                         ["char(7)", "char(7)", "text"], True)

    number_pat = re.compile(r'^\d+$')
    trailing_pat = re.compile(r'(.+) \(.*\d.*\)$')

    filename = fileutils.getcache("Section2All_underlying.csv", "bea", "nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False

    code_stack = [None]
    indent_stack = [-1]

    # the code mapping has been done such that each item is at least at
    # three levels of disaggregation below the top, i.e. there is always
    # an ancestor at the second level. we only want to keep track of the
    # ancestor at the third level (root is zero)
    # the first level below root has goods and services
    # the second level has durable goods, nondurable goods, and services.
    reverse_code_dict = {}
    second_level_nodes = []

    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 2.4.5U"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 2.4.5U"):
                    # we only need to go through one instance of this table
                    break
                else:
                    if number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()

                        # these are duplicate codes
                        if title.startswith("Market-based PCE"):
                            continue

                        code = row[2]
                        current_indent = len(row[1]) - len(title)

                        while current_indent <= indent_stack[-1]:
                            indent_stack.pop()
                            code_stack.pop()

                        indent_stack.append(current_indent)
                        code_stack.append(code)

                        if len(code_stack) > 1:
                            parent = code_stack[-2]
                        else:
                            parent = None

                        title = title.strip()
                        trailing = trailing_pat.match(title)
                        if trailing:
                            title = trailing.group(1)

                        if len(code_stack) > 4:
                            reverse_code_dict[title] = code_stack[3]
                        else:
                            reverse_code_dict[title] = code

                        tracker.insert_row((code, parent, title))

    tracker.flush()
    fh.close()

    # table for price deflators

    tracker.create_table("%s.implicit_price_deflators" % config.NIPA_SCHEMA,
                         ["year", "gdp", "pce"], ["int", "float", "float"])

    filename = fileutils.getcache("Section1all_csv.csv", "bea/nipa")
    fh = open(filename)
    csvf = csv.reader(fh)
    is_in_table = False

    data = {}  # we need to parse two rows before we can populate
    years = {}

    for row in csvf:
        if len(row):
            if not is_in_table:
                if row[0].startswith("Table 1.1.9"):
                    is_in_table = True
            else:
                if row[0].startswith("Table 1.1.9"):
                    # this is the seasonally adjusted version of the same table
                    break
                else:
                    if row[0] == "Line":
                        for i in range(len(row)):
                            if number_pat.match(row[i]):
                                year = int(row[i])
                                years[year] = i
                                data[year] = {}

                    elif number_pat.match(row[0]) and len(row) > 2:
                        title = row[1].lstrip()
                        if title == "Gross domestic product":
                            column = "gdp"
                        elif title == "Personal consumption expenditures":
                            column = "pce"
                        else:
                            continue

                        for (year, colindex) in years.items():
                            data[year][column] = float(row[colindex])

    for (year, results) in data.items():
        tracker.insert_row([year, results["gdp"], results["pce"]])

    tracker.flush()
    fh.close()

    # parse pce bridge

    class IONIPAStateTracker(TableStateTracker):
        def flush(self):
            TableStateTracker.flush(self)
            if self.fh is not None and not self.fh.closed:
                self.fh.close()

        def __init__(self):
            TableStateTracker.__init__(self)
            self.fh = None
            self.code_dict = None

            self.value_columns = [
                "prod_val", "rail_margin", "truck_margin", "water_margin",
                "air_margin", "pipe_margin", "gaspipe_margin",
                "wholesale_margin", "retail_margin", "purchase_val"
            ]

            self.old_style_field_map = {
                "Producers' Value": "prod_val",
                "MfgExciseTax": "prod_val",
                "RailMargin": "rail_margin",
                "TruckMargin": "truck_margin",
                "WaterMargin": "water_margin",
                "AirMargin": "air_margin",
                "PipeMargin": "pipe_margin",
                "WholesaleMargin": "wholesale_margin",
                "WholesaleTax": "wholesale_margin",
                "RetailMargin": "retail_margin",
                "RetailSalesTax": "retail_margin",
                "OtherRetailTax": "retail_margin",
                "Purchasers' Value": "purchase_val",
            }

        def set_filename(self, filename):
            path = fileutils.getcache(filename, str(self.year))
            self.filename = path

        def set_year(self, year):
            self.flush()
            self.year = year
            tablename = "%s.pcebridge_%d" % (config.IO_SCHEMA, year)
            fields = ["pce_code", "commodity"] + self.value_columns
            types = ["varchar(6)", "varchar(6)"] + \
                ["bigint"]*len(self.value_columns)
            self.create_table(tablename, fields, types)

        def setup_for_codes(self):
            self.code_dict = {}

        def flush_codes(self):
            if self.code_dict is not None:
                tablename = "%s.nipa_codes_%d" % (config.IO_SCHEMA, self.year)
                self.create_table(tablename,
                                  ["pce_code", "nipa_group", "description"],
                                  ["varchar(6)", "char(7)", "text"])
                for (code, raw_desc) in self.code_dict.items():

                    desc = raw_desc
                    if desc.endswith('(s.)') or desc.endswith('(d.)'):
                        desc = desc[:-4].strip()
                    elif desc.endswith('(n.d.)'):
                        desc = desc[:-6].strip()

                    if desc in nipa_code_map:
                        desc = nipa_code_map[desc]

                    if desc in reverse_code_dict:
                        nipa_code = reverse_code_dict[desc]
                    else:
                        nipa_code = None
                    #self.current_stmt(code, nipa_code, raw_desc)
                    self.table.insert([code, nipa_code, raw_desc])

                self.code_dict = None
                self.flush()

        def insert_code_row(self, code, desc):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(code) is float:
                code = int(code)

            self.code_dict[str(code)] = desc.strip()

        def insert_row(self, pce_code, commod, dollar_values, factor=1):
            # workaround for the way excel interprets numbers as floats
            # when we know the codes should be strings
            if type(pce_code) is float:
                pce_code = int(pce_code)

            values = [str(pce_code).strip(), commod.strip()]
            for column in self.value_columns:
                if column in dollar_values:
                    if factor == 1:
                        values.append(dollar_values[column])
                    else:
                        values.append(
                            int(float(dollar_values[column]) * factor))
                else:
                    values.append(None)
            #self.current_stmt(*values)
            self.table.insert(values)

        def parse_old_style_xls(self, year):
            self.set_year(year)
            self.set_filename("%d_PCE_Commodity.xls" % self.year)
            wb = xlrd.open_workbook(self.filename)

            # parse pce bridge data
            sheet = wb.sheet_by_name("%d PCE Workfile - Commodity" % self.year)
            field_indexes = {}
            pce_code_idx = 0
            commod_idx = 2
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    if "PCE Category" in row:
                        pce_code_idx = row.index("PCE Category")
                        if "Commodity" in row:
                            commod_idx = row.index("Commodity")
                        for i in range(len(row)):
                            xls_col = row[i]
                            if xls_col in self.old_style_field_map:
                                colname = self.old_style_field_map[xls_col]
                                if colname not in field_indexes:
                                    field_indexes[colname] = []
                                field_indexes[colname].append(i)
                    elif len(field_indexes):
                        pce_code = row[pce_code_idx]
                        commod = str(int(row[commod_idx])).rjust(6, "0")
                        values = {}
                        for (field, columns) in field_indexes.items():
                            # documentation says units are in 100,000 dollars
                            # but the orders of magnitude don't match up with
                            # later years if we use 100
                            components = [
                                int(float(row[column]) * 1000)
                                for column in columns
                            ]
                            value = 0
                            for component in components:
                                value += component
                            values[field] = value
                        self.insert_row(pce_code, commod, values)

            # parse codes from neighboring worksheet
            self.setup_for_codes()
            sheet = wb.sheet_by_name("%d PCE Category Descriptions" %
                                     self.year)
            code_idx = None
            desc_idx = None
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1:
                    codetab = "PCE Category Code"
                    codetab2 = "%s - %d" % (codetab, self.year)
                    if codetab in row or codetab2 in row:
                        if codetab in row:
                            code_idx = row.index(codetab)
                        else:
                            code_idx = row.index(codetab2)
                        desctab = "PCE Category Description - %d" % self.year
                        if desctab in row:
                            desc_idx = row.index(desctab)
                        else:
                            desctab = "PCE Category Description"
                            if desctab in row:
                                desc_idx = row.index(desctab)
                    elif code_idx is not None and desc_idx is not None:
                        code = row[code_idx]
                        desc = str(row[desc_idx])
                        self.insert_code_row(code, desc)
            self.flush_codes()

        def get_file_handle(self, filetype, options={}):
            if filetype == "txt":
                self.fh = open(self.filename)
                return self.fh
            elif filetype == "csv":
                self.fh = open(self.filename)
                if "delim" in options:
                    csvf = csv.reader(self.fh, delimiter=options["delim"])
                else:
                    csvf = csv.reader(self.fh)
                return csvf
            elif filetype == "xls":
                wb = xlrd.open_workbook(self.filename)
                return wb

        def parse_text(self, rowcallback):
            # self.filename already holds the resolved cache path
            # (set by set_filename above)
            f = open(self.filename)
            for line in f:
                rowcallback(line, self)
            f.close()

    tracker = IONIPAStateTracker()
    tracker.parse_old_style_xls(1967)
    tracker.parse_old_style_xls(1972)
    tracker.parse_old_style_xls(1977)
    tracker.parse_old_style_xls(1982)

    tracker.set_year(1987)
    tracker.set_filename("tbld-87.dat")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 103:
            continue
        commod = line[0:6]
        pce_code = line[14:18]
        values = {
            "prod_val": line[21:30],
            "rail_margin": line[30:39],
            "truck_margin": line[39:48],
            "water_margin": line[48:57],
            "air_margin": line[57:66],
            "pipe_margin": line[66:75],
            "wholesale_margin": line[75:84],
            "retail_margin": line[84:93],
            "purchase_val": line[93:102],
        }
        tracker.insert_row(pce_code, commod, values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("io-nipa.doc")
    fh = tracker.get_file_handle("txt")
    for line in fh:
        if len(line) < 27:
            continue
        code = line[0:4].strip()
        desc = line[26:].strip()
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    tracker.set_year(1992)
    tracker.set_filename("TabD.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        values = {
            "prod_val": row[4],
            "rail_margin": row[5],
            "truck_margin": row[6],
            "water_margin": row[7],
            "air_margin": row[8],
            "pipe_margin": row[9],
            "gaspipe_margin": row[10],
            "wholesale_margin": row[11],
            "retail_margin": row[12],
            "purchase_val": row[13],
        }
        tracker.insert_row(row[2], row[0], values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA.txt")
    fh = tracker.get_file_handle("csv", {"delim": "\t"})
    for row in fh:
        code = row[0]
        desc = row[4]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    tracker.set_year(1997)
    tracker.set_filename("AppendixC_Detail.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        values = {
            "prod_val": row[3],
            "rail_margin": row[4],
            "truck_margin": row[5],
            "water_margin": row[6],
            "air_margin": row[7],
            "pipe_margin": row[8],
            "gaspipe_margin": row[9],
            "wholesale_margin": row[10],
            "retail_margin": row[11],
            "purchase_val": row[12],
        }
        tracker.insert_row(row[1], row[0], values, 1000)

    tracker.setup_for_codes()
    tracker.set_filename("IO-NIPA_PCE.txt")
    fh = tracker.get_file_handle("csv", {"delim": ","})
    for row in fh:
        code = row[1]
        desc = row[2]
        tracker.insert_code_row(code, desc)
    tracker.flush_codes()

    tracker.set_year(2002)
    tracker.setup_for_codes()  # do this simultaneously since it's all one file
    tracker.set_filename("2002_PCE_Bridge.xls")
    wb = tracker.get_file_handle("xls")
    naics_pat = re.compile(r'[A-Z0-9]{6}')
    sheet = wb.sheet_by_name("PCE_Bridge_Detail")
    pce_codes = []
    for rowindex in range(sheet.nrows):
        row = sheet.row_values(rowindex)
        if len(row) == 13 and naics_pat.match(row[1]):
            pce_desc = row[0]
            # we don't need the distinction between households and
            # nonprofit institutions serving households
            parts = pce_desc.split('-')
            if len(parts) > 1:
                lastpart = parts[-1].strip()
                if lastpart == 'HH' or lastpart == 'NPISH':
                    pce_desc = '-'.join(parts[:-1])
            pce_desc = pce_desc.strip()

            if pce_desc in pce_codes:
                pce_code = pce_codes.index(pce_desc)
            else:
                pce_code = len(pce_codes)
                pce_codes.append(pce_desc)
                tracker.insert_code_row(str(pce_code), pce_desc)

            values = {
                "prod_val": row[3],
                "rail_margin": row[4],
                "truck_margin": row[5],
                "water_margin": row[6],
                "air_margin": row[7],
                "pipe_margin": row[8],
                "gaspipe_margin": row[9],
                "wholesale_margin": row[10],
                "retail_margin": row[11],
                "purchase_val": row[12],
            }
            tracker.insert_row(str(pce_code), row[1], values, 1000)

    tracker.flush_codes()
Example #40
File: census_intl.py Project: sonya/eea
def doparse():

    # ppp rank from
    # https://www.cia.gov/library/publications/the-world-factbook/rankorder/2004rank.html
    countries = {
        "LUX": {"fips": "LU", "ppp": 3},
        "USA": {"fips": "US", "ppp": 11},
        "NLD": {"fips": "NL", "ppp": 17},
        "AUT": {"fips": "AU", "ppp": 18},
        "SWE": {"fips": "SW", "ppp": 21},
        "CAN": {"fips": "CA", "ppp": 20},
        "AUS": {"fips": "AS", "ppp": 22},
        "IRL": {"fips": "EI", "ppp": 23},
        "DEU": {"fips": "GM", "ppp": 26},
        "TWN": {"fips": "TW", "ppp": 27},
        "BEL": {"fips": "BE", "ppp": 28},
        "DNK": {"fips": "DK", "ppp": 29},
        "FIN": {"fips": "FI", "ppp": 32},
        "GBR": {"fips": "UK", "ppp": 33},
        "FRA": {"fips": "FR", "ppp": 35},
        "JPN": {"fips": "JA", "ppp": 36},
        "KOR": {"fips": "KS", "ppp": 40},
        "ESP": {"fips": "SP", "ppp": 43},
        "ITA": {"fips": "IT", "ppp": 44},
        "CYP": {"fips": "CY", "ppp": 46},
        "SVN": {"fips": "SI", "ppp": 47},
        "CZE": {"fips": "EZ", "ppp": 50}, # EZ??
        "GRC": {"fips": "GR", "ppp": 52},
        "MLT": {"fips": "MT", "ppp": 53},
        "PRT": {"fips": "PO", "ppp": 57},
        "SVK": {"fips": "LO", "ppp": 58},
        "POL": {"fips": "PL", "ppp": 60},
        "EST": {"fips": "EN", "ppp": 61},
        "HUN": {"fips": "HU", "ppp": 63},
        "LTU": {"fips": "LH", "ppp": 65},
        "RUS": {"fips": "RS", "ppp": 71},
        "LVA": {"fips": "LG", "ppp": 75},
        "MEX": {"fips": "MX", "ppp": 85},
        "TUR": {"fips": "TU", "ppp": 86},
        "BRA": {"fips": "BR", "ppp": 92},
        "ROU": {"fips": "RO", "ppp": 97},
        "BGR": {"fips": "BU", "ppp": 101},
        "CHN": {"fips": "CH", "ppp": 121},
        "IDN": {"fips": "ID", "ppp": 156},
        "IND": {"fips": "IN", "ppp": 164},
        }
    
    tablename = "world_supplement"
    table = SQLTable(tablename,
                     ["year", "country", "pop", "gdp", "ppp"],
                     ["int", "char(3)", "int", "float", "float"]).create()
    table.truncate()
    
    country_fips = {}
    data = {}
    for (country, info) in countries.items():
        data[country] = {}
        country_fips[info["fips"]] = country
    
    # this file spec is documented in the xlsx file from the archive
    thisyear = datetime.datetime.now().year
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    if year >= thisyear: # we don't want future projections
                        continue
                    country = country_fips[fips]
                    data[country][year] = {"pop": int(fields[2])}
    
    worldbank = {
        "ppp": "NY.GNP.PCAP.PP.CD_Indicator_MetaData_en_EXCEL.xls",
        "gdp": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        }
    
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    # this discards years where we don't have population
                    if year in data[country] and \
                            type(value) is float and value != 0:
                        data[country][year][indicator] = value
    
    for (country, country_data) in data.items():
        for (year, year_data) in country_data.items():
            ppp = None
            gdp = None
            pop = year_data["pop"]
            if "gdp" in year_data:
                gdp = year_data["gdp"]
            if "ppp" in year_data:
                ppp = year_data["ppp"]

            table.insert([year, country, pop, gdp, ppp])
예제 #41
0
파일: pcebridge.py 프로젝트: sonya/eea
def parse_text(self, rowcallback):
    # assuming the source file name is stored on the instance; the
    # original referenced an undefined "filename" and used "this"
    path = fileutils.getcache(self.filename, str(self.year))
    with open(path) as f:
        for line in f:
            rowcallback(line, self)
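Assuming the fixes above, a rowcallback takes the raw line and the parser instance. A hypothetical driver (the stub class and file name are invented):

class PCEBridgeStub:
    # stand-in for whatever class parse_text belongs to
    def __init__(self, year, filename):
        self.year = year
        self.filename = filename

def print_row(line, parser):
    print(parser.year, line.rstrip())

# bridge = PCEBridgeStub(2002, "pcebridge.txt")
# bridge.parse_text(print_row)  # would echo each cached line with its year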
예제 #42
0
파일: bts.py 프로젝트: sonya/eea
def doparse():
    carrier_countries = {
        #"-": "", # Unknown
        "1I": "USA", # Sky Trek International Airlines
        "2T": "CAN", # Canada 3000 Airlines Ltd.
        "3Z": "USA", # Tatonduk Outfitters Limited d/b/a Everts Air Alaska and Everts Air Cargo
        "5X": "USA", # United Parcel Service
        "5Y": "USA", # Atlas Air Inc.
        "6F": "GBR", # Laker Airways Inc.
        #"6U": "", # Air Ukraine
        #"6Y": "", # Nicaraguense De Aviacion Sa
        #"7P": "", # Apa International Air S.A. (dominican rep)
        #"7Z": "", # Lb Limited
        "8C": "USA", # Air Transport International
        "AA": "USA", # American Airlines Inc.
        "AC": "CAN", # Air Canada
        #"ADB": "", # Antonov Company (ukraine)
        "AF": "FRA", # Compagnie Nat'l Air France
        "AI": "IND", # National Aviation Company of India Limited d/b/a Air India
        "AM": "MEX", # Aeromexico
        #"AQQ": "", # Air Charter (Safa)
        #"AR": "", # Aerolineas Argentinas
        "AS": "USA", # Alaska Airlines Inc.
        #"AT": "", # Royal Air Maroc (morocco)
        #"AV": "", # Aerovias Nac'l De Colombia
        "AY": "FIN", # Finnair Oy
        "AZ": "ITA", # Compagnia Aerea Italiana
        #"All Rows": "", # All Rows (including those not displayed)
        "BA": "GBR", # British Airways Plc
        #"BBQ": "", # Balair Ag (swiss)
        "BCQ": "CAN", # Bradley Air Services Ltd.
        #"BG": "", # Biman Bangladesh Airlines
        "BQ": "MEX", # Aeromar C. Por A.
        "BR": "TWN", # Eva Airways Corporation
        #"BW": "", # Caribbean Airlines Limited (trinidad and tobago)
        "BY": "GBR", # Britannia Airways Ltd.
        "CA": "CHN", # Air China
        #"CC": "", # Air Atlanta Icelandic
        "CDQ": "USA", # Kitty Hawk International
        #"CF": "", # Compan. De Aviacion Faucett (peru)
        "CI": "TWN", # China Airlines Ltd.
        #"CLQ": "", # Aero Transcolombiana
        #"CM": "", # Compania Panamena (Copa)
        "CO": "USA", # Continental Air Lines Inc.
        "CP (1)": "CAN", # Canadian Airlines International Ltd.
        "CS": "USA", # Continental Micronesia
        "CV": "LUX", # Cargolux Airlines International S.A
        #"CVQ": "", # Caraven S.A.
        #"CX": "", # Cathay Pacific Airways Ltd. (hong kong, includes pre 1997)
        "CYQ": "FRA", # Corse Air International (assuming corsair)
        "CZ": "CHN", # China Southern Airlines
        "DE": "DEU", # Condor Flugdienst
        "DHQ": "GBR", # DHL Aero Expresso
        "DL": "USA", # Delta Air Lines Inc.
        #"ED": "", # Andes (ecuador or argentina)
        "EH": "ESP", # Saeta Airlines
        "EI": "IRL", # Aer Lingus Plc
        #"EOQ": "", # Aeroservicios Ecuatorianos
        "ER": "USA", # Astar USA, LLC
        #"EU": "", # Ecuatoriana De Aviacion
        #"EXQ": "", # Export Air Del Peru S.A.
        "EZ": "TWN", # Evergreen International Inc.
        "F9": "USA", # Frontier Airlines Inc.
        "FCQ": "USA", # Falcon Air Express
        #"FF": "", # Tower Air Inc.
        #"FI": "", # Icelandair
        #"FJ": "", # Air Pacific Ltd. (fiji)
        "FNQ": "USA", # Fine Airlines Inc.
        #"FQ": "", # Air Aruba
        #"FS": "", # Serv De Trans Aereos Fuegui (argentina)
        "FX": "USA", # Federal Express Corporation
        #"G3": "", # Aerochago S.A.
        "GA": "IDN", # P.T. Garuda Indonesian Arwy
        "GD": "MEX", # Transp. Aereos Ejecutivos
        #"GF": "", # Gulf Air Company (bahrain)
        #"GH": "", # Ghana Airways Corporation
        "GJ (1)": "MEX", # Mexicargo
        "GL": "USA", # Miami Air International
        "GR": "USA", # Gemini Air Cargo Airways
        #"GU": "", # Aviateca (guatemala)
        #"GY": "", # Guyana Airways Corporation
        "H2": "BEL", # City Bird
        "H5": "RUS", # Magadan Airlines
        "HA": "USA", # Hawaiian Airlines Inc.
        "HAQ": "DEU", # Hapag Lloyd Flug.
        "HCQ": "USA", # Av Atlantic
        #"HFQ": "", # Haiti Air Freight Intl
        "HLQ": "AUS", # Heavylift Cargo Airlines Lt
        "HP": "USA", # America West Airlines Inc. (Merged with US Airways 9/05. Stopped reporting 10/07.)
        #"HY": "", # Uzbekistan Airways
        "IB": "ESP", # Iberia Air Lines Of Spain
        #"ITQ": "", # Interamericana De Aviacion (uruguay)
        "IW": "FRA", # Air Liberte Aka Aom Minerve
        #"JAQ": "", # Jamaica Air Freighters
        "JD": "JPN", # Japan Air System Co. Ltd.
        "JI (1)": "USA", # Midway Airlines Inc.
        "JK": "ESP", # Spanair S.A.
        "JKQ": "USA", # Express One International Inc.
        "JL": "JPN", # Japan Air Lines Co. Ltd.
        #"JM": "", # Air Jamaica Limited
        "JR": "USA", # Aero California
        "JW": "CAN", # Arrow Air Inc.
        "JZ": "JPN", # Japan Air Charter Co. Ltd.
        "K8 (1)": "NLD", # Dutch Caribbean Airlines
        "KE": "KOR", # Korean Air Lines Co. Ltd.
        "KH": "USA", # Aloha Air Cargo
        #"KI": "", # Time Air Ltd. (south africa)
        "KL": "NLD", # Klm Royal Dutch Airlines
        #"KP": "", # Kiwi International
        "KR": "USA", # Kitty Hawk Aircargo
        "KTQ": "TUR", # Turks Air Ltd.
        #"KU": "", # Kuwait Airways Corp.
        "KW": "USA", # Carnival Air Lines Inc.
        #"KX": "", # Cayman Airways Limited
        "KZ": "JPN", # Nippon Cargo Airlines
        #"LA": "", # Lan-Chile Airlines
        #"LB": "", # Lloyd Aereo Boliviano S. A.
        "LGQ": "MEX", # Lineas Aereas Allegro
        "LH": "DEU", # Lufthansa German Airlines
        "LO": "POL", # Polskie Linie Lotnicze
        #"LR": "", # Lacsa (costa rica)
        #"LSQ": "", # Lineas Aereas Suramerican (colombia)
        "LT": "DEU", # Luftransport-Unternehmen
        #"LU": "", # Air Atlantic Dominicana
        #"LY": "", # El Al Israel Airlines Ltd.
        "LZ": "BGR", # Balkan Bulgarian Airlines
        "M6": "USA", # Amerijet International
        "M7": "MEX", # Aerotransportes Mas De Crga
        "MA": "HUN", # Malev Hungarian Airlines
        "MG": "USA", # Champion Air
        #"MH": "", # Malaysian Airline System
        #"ML": "", # Aero Costa Rica
        "MP": "NLD", # Martinair Holland N.V.
        #"MS": "", # Egyptair
        "MT": "GBR", # Thomas Cook Airlines Uk Ltd.
        "MT (1)": "GBR", # Flying Colours Airlines Ltd.
        "MU": "CHN", # China Eastern Airlines
        #"MUQ": "", # Aerolineas Mundo (columbia)
        "MX": "MEX", # Compania Mexicana De Aviaci
        #"MYQ": "", # Lineas Aereas Mayas (Lamsa)
        #"N5 (1)": "", # Nations Air Express Inc.
        "NA": "USA", # North American Airlines
        "NG": "DEU", # Lauda Air Luftfahrt Ag
        "NH": "JPN", # All Nippon Airways Co.
        "NK": "USA", # Spirit Air Lines
        "NW": "USA", # Northwest Airlines Inc.
        "NWQ": "USA", # N. W. Territorial Airways
        #"NZ": "", # Air New Zealand
        "OA": "GRC", # Olympic Airways
        #"OI": "", # Prestige Airways (uae)
        "OK": "CZE", # Czech Airlines
        #"ON": "", # Air Nauru
        "OS": "AUT", # Austrian Airlines
        "OW": "USA", # Executive Airlines
        "OZ": "KOR", # Asiana Airlines Inc.
        "PA (2)": "USA", # Pan American World Airways
        "PCQ": "USA", # Pace Airlines
        #"PIQ": "", # Pacific International Airlines (ambiguous: usa, panama)
        #"PK": "", # Pakistan International Airlines
        #"PL": "", # Aero Peru
        "PNQ": "USA", # Panagra Airways
        "PO": "USA", # Polar Air Cargo Airways
        #"PR": "", # Philippine Airlines Inc.
        "PRQ": "USA", # Florida West Airlines Inc.
        "PT": "USA", # Capital Cargo International
        #"PY": "", # Surinam Airways Limited
        "Q7": "BEL", # Sobelair
        "QF": "AUS", # Qantas Airways Ltd.
        "QK": "CAN", # Jazz Aviation LP
        #"QN": "", # Royal Air (ambiguous)
        "QO": "MEX", # Aeromexpress
        "QQ": "USA", # Reno Air Inc.
        #"QT": "", # Transportes Aereos Mercantiles Panamericanos S.A (colombia)
        "QTQ": "IRL", # Aer Turas Teoranta
        "QX": "USA", # Horizon Air
        "RD": "USA", # Ryan International Airlines
        "REQ": "USA", # Renown Aviation
        "RG": "BRA", # Varig S. A.
        #"RJ": "", # Alia-(The) Royal Jordanian
        #"RK": "", # Air Afrique
        "RNQ": "GBR", # Mytravel Airways
        "RO": "ROU", # Tarom Romanian Air Transpor
        #"SA": "", # South African Airways
        "SAQ": "USA", # Southern Air Transport Inc.
        "SEQ": "GBR", # Sky Service F.B.O.
        "SIQ": "LUX", # Premiair
        "SK": "SWE", # Scandinavian Airlines Sys.
        "SM": "USA", # Sunworld International Airlines
        "SN (1)": "BEL", # Sabena Belgian World Air.
        "SPQ": "USA", # Sun Pacific International
        #"SQ": "", # Singapore Airlines Ltd.
        #"SR": "", # Swissair Transport Co. Ltd.
        "SU": "RUS", # Aeroflot Russian Airlines
        #"SV": "", # Saudi Arabian Airlines Corp
        "SX (1)": "MEX", # Aeroejecutivo S.A.
        "SY": "USA", # Sun Country Airlines d/b/a MN Airlines
        "T9": "USA", # TransMeridian Airlines
        #"TA": "", # Taca International Airlines (el savador)
        "TCQ": "USA", # Express.Net Airlines
        #"TG": "", # Thai Airways International Ltd.
        "TK": "TUR", # Turk Hava Yollari A.O.
        "TKQ": "USA", # Trans-Air-Link Corporation
        "TNQ": "USA", # Emery Worldwide Airlines
        "TP": "PRT", # Tap-Portuguese Airlines
        "TR": "BRA", # Transbrasil S.A.
        "TRQ": "SWE", # Blue Scandinavia Ab
        "TS": "CAN", # Air Transat
        "TW": "USA", # Trans World Airways LLC
        #"TZ": "", # ATA Airlines d/b/a ATA (iran)
        "TZQ": "GBR", # First Choice Airways
        "U7": "USA", # USA Jet Airlines Inc.
        "UA": "USA", # United Air Lines Inc.
        #"UD": "", # Fast Air Carrier Ltd.
        "UN": "RUS", # Transaero Airlines
        #"UP": "", # Bahamasair Holding Limited
        "US": "USA", # US Airways Inc. (Merged with America West 9/05. Reporting for both starting 10/07.)
        "UX": "ESP", # Air Europa
        #"UYQ": "", # Aerolineas Uruguayas S.A.
        #"VA (1)": "", # Venezuelan International Airways
        #"VC": "", # Servicios Avensa (venezuela)
        #"VE": "", # Aerovias Venezolanas-Avensa
        "VIQ": "RUS", # Volga-Dnepr Airlines
        "VP": "BRA", # Viacao Aerea Sao Paulo
        #"VR": "", # Transportes Aereos De Cabo (cape verde)
        "VS": "GBR", # Virgin Atlantic Airways
        #"VX (1)": "", # Aces Airlines (colombia)
        #"W7": "", # Western Pacific Airlines (solomon islands)
        #"WD": "", # Halisa Air (haiti)
        "WE": "USA", # Centurion Cargo Inc.
        "WO": "USA", # World Airways Inc.
        #"XC": "", # Air Caribbean (1)
        "XE": "USA", # ExpressJet Airlines Inc. (1)
        "XJ": "USA", # Mesaba Airlines
        "XP": "USA", # Casino Express
        "YX (1)": "USA", # Midwest Airline, Inc.
        "ZB": "USA", # Monarch Airlines
        #"ZUQ": "", # Zuliana De Aviacion (venezuela)
        "ZX (1)": "CAN", # Airbc Ltd.
        }

    tablename = "air_carriers"
    table = SQLTable(
        tablename,
        ["year", "carrier", "series", "value"],
        ["int", "varchar(15)", "varchar(15)", "int"])
    table.create()
    table.truncate()

    carriers = {}

    for year in config.STUDY_YEARS:
        for filestem in ["freight", "passengers"]:
            filename = filestem + str(year) + ".csv"
            path = fileutils.getcache(filename, "bts")
            with open(path) as fh:
                csvf = csv.reader(fh)
                next(csvf)  # one preamble line precedes the header
                header = next(csvf)
                for row in csvf:
                    if len(row) == 3:
                        carrier = row[0]
                        #carrier_name = row[1]
                        if carrier in carrier_countries:
                            # the "carrier" column receives the carrier's
                            # home country rather than the carrier code
                            country = carrier_countries[carrier]
                            value = int(row[2])
                            table.insert([year, country, filestem, value])
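The loop above assumes each BTS download is a three-column CSV with one preamble line before the header. A self-contained sketch of that filtering step with invented rows:

import csv, io

sample = io.StringIO(
    "preamble\n"
    "CARRIER,CARRIER_NAME,VALUE\n"
    "AA,American Airlines Inc.,123456\n"
    "ZZ,Some Unmapped Carrier,999\n"
)
csvf = csv.reader(sample)
next(csvf)           # discard the preamble line
header = next(csvf)  # column names
carrier_countries = {"AA": "USA"}
for row in csvf:
    if len(row) == 3 and row[0] in carrier_countries:
        print(carrier_countries[row[0]], int(row[2]))  # -> USA 123456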
예제 #43
0
파일: parser.py 프로젝트: sonya/eea
def parse_io():
    # choose the medium-level classification (中分類) for all io tables.
    # the 中分類 sheets for 1990 and 1995 don't break down the electronic
    # sectors as far as i would like, so use the finer 小分類 for those years
    files = {
        1990: "l00_21.xls",
        1995: "l00_21.xls",
        2000: "io00a301.xls",
        2005: "io05a301.xls",
        }

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        # 1995 and 2000 io tables: easiest
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        # for 1995 use the heisei 2-7-12 file since it has more
        # harmonized sectors than the standalone 1995 file
        if year == 1995:
            sheetindex = 2
        else:
            # the first page of the heisei 2-7-12 file (used for 1990)
            # happens to be 1990 at nominal prices, matching the others
            sheetindex = 0

        path = fileutils.getcache(filename, "jp", str(year))
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(sheetindex)
        ind_names = None
        ind_codes = None
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            if ind_codes is None:
                for cell in row:
                    if cell == 1:
                        ind_codes = [str(c).strip().rjust(3, "0")
                                     for c in row]
                        break
                    # numeric cells have no strip(); guard the comparison
                    if isinstance(cell, str) and cell.strip() == "001":
                        ind_codes = row
                        break
            elif ind_names is None:
                ind_names = row
                temp_codes = [None, None]
                for i in range(2, len(row)):
                    temp_codes.append(
                        codes.set_code(ind_codes[i], row[i]))
                ind_codes = temp_codes
            else:
                from_code = row[0]
                if type(from_code) is float:
                    from_code = str(int(from_code)).rjust(3, "0")
                from_code = codes.set_code(from_code, row[1])
                if from_code:
                    for i in range(2, len(row)):
                        to_code = ind_codes[i]
                        value = row[i]
                        tables.insert_io(year, from_code, to_code, value)
 
        codes.update_codes()
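The sector-code normalization above is the fiddly part: xlrd hands back numeric cells as floats, which must be stringified and zero-padded to three characters before they can serve as keys. A standalone sketch of the rule (helper name invented):

def normalize_sector_code(cell):
    # 1.0 -> "001", "21 " -> "021"; strings pass through with padding
    if type(cell) is float:
        cell = str(int(cell))
    return str(cell).strip().rjust(3, "0")

assert normalize_sector_code(1.0) == "001"
assert normalize_sector_code("21 ") == "021"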
예제 #44
0
파일: parser.py 프로젝트: sonya/eea
def parse_env():
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]

            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if len(amount):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html, "html.parser")  # explicit parser

            print(adir, filename)
            title = soup.title.string

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False  # set once any data row has been seen
            max_sector_column = 0  # 1 if english is a separate column, else 0

            prev_rowdata = None
            prev_rowspans = None
            data = []

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this forces us to keep state across rows using heuristics.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)

                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows placed above the column headers;
                # we special-case anything we find there ("单位" = "unit")
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                            # overflow shouldn't appear on both sides:
                            # rowdata[0] overflows only in header rows and
                            # rowdata[-1] overflows only in data rows
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "

                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    columns = insert_now
                    insert_now = None

                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and len(insert_now) == len(
                        columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()
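The rowspan bookkeeping is the subtle part of this parser: a cell spanning n rows must leave blank placeholders in the following n-1 rows so that column indices stay aligned. A simplified standalone sketch of that countdown logic, with invented sample rows:

def align_row(cells, prev_rowspans):
    # cells: (text, rowspan) pairs physically present in this <tr>
    rowdata = []
    rowspans = []
    for (text, rowspan) in cells:
        # leave blanks wherever the previous row still spans into us
        i = len(rowdata)
        while prev_rowspans and i < len(prev_rowspans) and \
                abs(prev_rowspans[i]) > rowspan:
            rowdata.append("")
            rowspans.append(-abs(abs(rowspan) - abs(prev_rowspans[i])))
            i = len(rowdata)
        rowdata.append(text)
        rowspans.append(rowspan)
    return (rowdata, rowspans)

(r1, s1) = align_row([("Sector", 2), ("SO2", 1)], None)
(r2, s2) = align_row([("123", 1)], s1)
print(r1)  # ['Sector', 'SO2']
print(r2)  # ['', '123'] -- the blank sits under the spanning cell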
예제 #45
0
파일: parser.py 프로젝트: sonya/eea
def parse_env():

    files = {
        # 2005 only has the most detailed classification (細分類)
        1990: "ei90187p.xls",
        1995: "ei95186p.xls",
        2000: "ei2000p104v01j.xls",
        2005: "ei2005pc403jp_wt_bd.xlsx",
        }

    def series_names_from_rows(names, units):
        # since these tables are structured identically
        # we'll just do some hard coding
        series_names = []
        for i in range(3, len(names)):
            if names[i]:  # handles "" from xlrd and None from openpyxl
                name = "%s (%s)" % (names[i], units[i])
            else:
                name = None
            series_names.append(name)
        return series_names

    tables = HybridTableCreator(config.SCHEMA)

    for (year, filename) in files.items():
        tables.add_env_table(year, series_max_length=255)
        codes = tables.new_sector_codes(year, "env_ind")
        codes.curate_code_from_desc("総合計", "total")
        codes.blacklist_code("total")

        path = fileutils.getcache(filename, "jp", str(year))
        if filename.endswith("xls"):
            wb = xlrd.open_workbook(path)
            # each xls file starts with a ToC listing tables A-E.
            # E1: direct energy consumption and energy intensity by sector
            # E2: direct CO2 emissions and CO2 intensity by sector
            for sheetname in ("E1", "E2"):
                sheet = wb.sheet_by_name(sheetname)
                min_series_col = 4 # first col whose values interest us
                if sheetname == "E1":
                    min_series_col = 3 # GDP - only want this once
    
                series_names = series_names_from_rows(
                    sheet.row_values(0),
                    sheet.row_values(1))

                for i in range(2, sheet.nrows):
                    row = sheet.row_values(i)
                    code = row[1]
                    if type(code) is float:
                        code = str(int(code)).rjust(3, "0")
                    code = codes.set_code(code, row[2])
                    if code:
                        for (series, value) in zip(series_names, row[3:]):
                            if type(value) is float:
                                tables.insert_env(year, code, series, value)
    
        elif filename.endswith("xlsx"):
            wb = openpyxl.load_workbook(filename=path, use_iterators=True)
            # E: 部門別直接エネルギー消費量および各種GHG排出量,
            #    エネルギー原単位およびGHG原単位を掲載
            sheet = wb.get_sheet_by_name("E")
            rows = sheet.iter_rows()
            series_names = series_names_from_rows(
                [cell.internal_value for cell in next(rows)],
                [cell.internal_value for cell in next(rows)])
            for row in rows:
                code = codes.set_code(row[1].internal_value,
                                      row[2].internal_value)
                if code:
                    for (series, cell) in zip(series_names, row[3:]):
                        if cell.internal_value is not None:
                            tables.insert_env(year, code, series,
                                              cell.internal_value)

        codes.update_codes()
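To make the header handling concrete, series_names_from_rows is restated below as a standalone function with invented sample rows:

def series_names_from_rows(names, units):
    # pair the header row with the units row beneath it; blank names
    # yield None so those columns are skipped during insertion
    series_names = []
    for i in range(3, len(names)):
        if names[i]:
            series_names.append("%s (%s)" % (names[i], units[i]))
        else:
            series_names.append(None)
    return series_names

names = ["", "code", "sector", "Energy", "CO2", ""]
units = ["", "", "", "TJ", "t-CO2", ""]
print(series_names_from_rows(names, units))
# ['Energy (TJ)', 'CO2 (t-CO2)', None]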
예제 #47
0
파일: food_sector.py 프로젝트: sonya/eea
import csv

from usa import bea, config, eia, common, wiod_code_map
from common.dbconnect import db
from common import fileutils, utils, sqlhelper

old_meat_codes = ["140101", "140102", "140103", "140105"]
new_meat_codes = ["311611", "311612", "311615", "31161A"]

combined_meat_codes = old_meat_codes + new_meat_codes

for year in config.STUDY_YEARS:
    print(year)

    path = fileutils.getcache("fossil_fuel_estimates_%d.csv" % year, "usa")
    fh = open(path, "r")
    csvf = csv.reader(fh)
    io_codes = common.io_codes_for_year(year)

    data = {}
    row = next(csvf)
    for row in csvf:
        if len(row) == 6:
            sector = row[0]
            btu = row[1] # total
            #btu = row[2] # coal
            #btu = row[3] # natural gas
            #btu = row[5] # PA-nontrans
            data[sector] = float(btu)
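The exact column headers of the cached file are not shown here; judging from the commented-out alternatives it carries six columns per row: sector, total Btu, coal, natural gas, one unlabeled series, and non-transportation petroleum. A sketch with hypothetical data:

import csv, io

sample = io.StringIO(
    "sector,total,coal,natgas,other,pa_nontrans\n"  # assumed layout
    "311611,1234.5,100.0,400.0,34.5,700.0\n"
)
csvf = csv.reader(sample)
next(csvf)  # skip header
data = {}
for row in csvf:
    if len(row) == 6:
        data[row[0]] = float(row[1])  # pick the "total" series
print(data)  # {'311611': 1234.5}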
예제 #48
0
def doparse():

    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename,
                     ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    country_fips = {
        "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT", "SW": "SWE",
        "CA": "CAN", "AS": "AUS", "EI": "IRL", "GM": "DEU", "BE": "BEL",
        "TW": "TWN", "DA": "DNK", "UK": "GBR", "FR": "FRA", "JA": "JPN",
        "KS": "KOR", "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE",
        "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK", "PL": "POL",
        "EN": "EST", "HU": "HUN", "LH": "LTU", "LG": "LVA", "MX": "MEX",
        "TU": "TUR", "BR": "BRA", "RO": "ROU", "BU": "BGR", "CH": "CHN",
        "ID": "IDN", "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA",
        }
    
    # this file spec is documented in the xlsx file from the archive
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
        }
    
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    imf_fields = (
        "LP", # population
        "PPPPC", # ppp per capita
        "NGDPRPC", # gdp per capita in constant prices
        "NGDP_D", # gdp deflator
        )

    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")

    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}

        valid_year = re.compile("\d{4}")
        valid_float = re.compile("-*[\d\.,]+")

        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i

        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value): #and year < last_year:
                        table.insert([year, country, field,
                                      float(value.replace(",", ""))])
예제 #49
0
파일: parser.py 프로젝트: sonya/eea
def parse_io():
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
        }

    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)
    
        # values are in millions of NTD (New Taiwan dollars)
        table = SQLTable(tablename,
                         ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()
    
        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        to_codes = sheet.row_values(0)  # codes unused; names are the keys
        to_names = sheet.row_values(1)
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_code = row[0].strip()
            from_name = row[1].strip()
            for i in range(2, len(to_names)):
                to_name = to_names[i].strip()
                value = row[i]
                table.insert([from_name, to_name, value])

        if year == 2010:
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist": sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                    sqlhelper.set_repr(config.from_blacklists[year]),
                }

            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
                SELECT from_map.io_sector AS from_sector,
                       to_map.io_sector as to_sector,
                       sum(millions) as millions
                  FROM %(tablename)s io,
                       (SELECT DISTINCT io_sector, io_commod
                          FROM %(maptable)s) from_map,
                       (SELECT DISTINCT io_sector, io_ind
                          FROM %(maptable)s) to_map
                 WHERE io.to_sector NOT IN %(to_blacklist)s
                   AND io.from_sector NOT IN %(from_blacklist)s
                   AND from_map.io_commod = io.from_sector
                   AND to_map.io_ind = io.to_sector
                 GROUP BY from_map.io_sector, to_map.io_sector""" % strings

            print(sql)
            db.execute(sql)
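The DDL above is assembled with Python's mapping-based %-interpolation: each %(key)s placeholder pulls the matching entry out of the strings dict. A tiny illustration with hypothetical resolved names (config.SCHEMA is not shown in this snippet):

strings = {
    "viewname": "tw.io_view_2010",   # hypothetical
    "tablename": "tw.io_2010",       # hypothetical
}
sql = ("CREATE OR REPLACE VIEW %(viewname)s AS "
       "SELECT * FROM %(tablename)s") % strings
print(sql)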