Example #1
def parse_codes():
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()
    for sheet in sheets:
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)

        if year > 2006: # CxI only, need map between commods and inds
            colnames = ["io_sector", "env_sector", "harmonized_env",
                        "io_commod", "io_ind"]
            coltypes = ["varchar(255)"]*5
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
            coltypes = ["varchar(255)"]*3

        table = SQLTable(tablename, colnames, coltypes)
        table.create()
        table.truncate()
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            io_sector = row[0].strip()
            env_sector = row[1].strip()
            harmonized_env = row[2]
            if type(harmonized_env) is float:
                harmonized_env = str(int(harmonized_env))
            harmonized_env = harmonized_env.strip()
            if year > 2006:
                io_commod = row[4].strip()
                io_ind = row[5].strip()
                table.insert([io_sector, env_sector, harmonized_env,
                              io_commod, io_ind])
            else:
                table.insert([io_sector, env_sector, harmonized_env])
Example #2
class TableStateTracker:

    def __init__(self):
        self.xact = None
        self.table = None

    def drop_table(self, tablename, cascade=False):
        self.table = SQLTable(tablename)
        self.table.drop(cascade)

    def create_table(self, tablename, cols, coltypes, cascade=False):
        self.flush()
        self.table = SQLTable(tablename, cols, coltypes)
        self.table.drop(cascade)
        self.table.create()
        self.warmup()

    def insert_row(self, values):
        self.table.insert(values)
        #self.current_stmt(*values)

    def warmup(self):
        self.xact = db.xact(mode="READ WRITE")
        self.xact.begin()

    def flush(self):
        if self.xact is not None:
            self.xact.commit()
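
A minimal usage sketch for the tracker above (the table name, columns, and values are made up for illustration; it assumes the same db/SQLTable helpers used throughout these examples):

tracker = TableStateTracker()
tracker.create_table("io.make_2002",
                     ["industry", "commodity", "thousands"],
                     ["varchar(6)", "varchar(6)", "bigint"])
# insert_row() runs inside the transaction opened by warmup()
tracker.insert_row(["331110", "331110", 12345])
tracker.flush()  # commit the pending inserts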
Example #3
class TableStateTracker:
    def __init__(self):
        self.xact = None
        self.table = None

    def drop_table(self, tablename, cascade=False):
        self.table = SQLTable(tablename)
        self.table.drop(cascade)

    def create_table(self, tablename, cols, coltypes, cascade=False):
        self.flush()
        self.table = SQLTable(tablename, cols, coltypes)
        self.table.drop(cascade)
        self.table.create()
        self.warmup()

    def insert_row(self, values):
        self.table.insert(values)
        #self.current_stmt(*values)

    def warmup(self):
        self.xact = db.xact(mode="READ WRITE")
        self.xact.begin()

    def flush(self):
        if self.xact is not None:
            self.xact.commit()
Example #4
class CodeTracker:
    def __init__(self, name):
        self.name = name
        self.table = SQLTable(
            "%s.%s" % (config.WIOD_SCHEMA, name),
            ["code", "description"],
            ["varchar(15)", "varchar(255)"])
        self.code_dict = None

    def setup(self):
        self.table.create()
        self.get_codes()

    # get existing codes from db
    def get_codes(self):
        if self.code_dict is None:
            self.code_dict = {}
        for (code, desc) in self.table.getall():
            self.code_dict[code] = desc

    def get_desc_for_code(self, code):
        if code in self.code_dict:
            return self.code_dict[code]
        return None

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))

        if type(desc) is str:
            desc = desc.strip()

        if code is None or not len(code):
            if desc is None or not len(desc): # ignore empty args
                return False
            elif desc in config.fd_sectors: # choose manual codes
                code = config.fd_sectors[desc]
            elif desc in config.va_sectors: # choose manual codes
                code = config.va_sectors[desc]
            else:
                return False
        elif code in config.code_blacklist: # ignore invalid values for codes
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            print(self.code_dict[code], desc)
        self.code_dict[code] = desc

        return code

    def update_codes(self):
        self.table.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.table.insert([code, desc])
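
A short sketch of how CodeTracker might be driven (the tracker name, code, and description are hypothetical, and it assumes the code is not listed in config.code_blacklist):

tracker = CodeTracker("industry_codes")
tracker.setup()
# set_code() returns the cleaned code ("3411") on success, False otherwise
code = tracker.set_code(" 3411 ", "Pulp mills")
if code:
    print(tracker.get_desc_for_code(code))
tracker.update_codes()  # rewrite the backing table from the accumulated dict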
Example #5
def doparse():
    country_dict = dict((v, k) for k, v in config.countries.items())
    country_dict["Slovakia"] = "SVK"

    sources = ["total", "nuclear", "thermal", "renewable",
               "geothermal", "solar", "wind", "biomass"]
    measurements = ["capacity", "consumption"]

    tablename = "%s.world_power" % ("eia")
    table = SQLTable(
        tablename,
        ["year", "country", "source", "units", "value"],
        ["int", "char(3)", "varchar(15)", "varchar(4)", "float"])
    table.create()
    table.truncate()

    for source in sources:
        for measure in measurements:
            if measure == "consumption":
                if source in ("geothermal", "solar", "wind", "biomass"):
                    continue

                units = "bkWh"
            elif measure == "capacity":
                units = "MkW"

            filename = source + "_" + measure + ".xls"
            path = fileutils.getcache(filename, "eia")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_index(0)
            header = None
            for i in range(sheet.nrows):
                row = sheet.row_values(i)
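                # the first row whose third cell is numeric is treated as the
                # header row of years; non-numeric header cells become None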
                if header is None:
                    if len(row) > 2 and type(row[2]) is float:
                        header = []
                        for cell in row:
                            if type(cell) is float:
                                header.append(int(cell))
                            else:
                                header.append(None)
                        header_len = len(header)
                elif len(row) > 2:
                    country_name = row[0]
                    if country_name in country_dict:
                        country = country_dict[country_name]
                        for col in range(2, header_len):
                            value = row[col]
                            year = header[col]
                            if type(value) is float and value > 0:
                                table.insert(
                                    [year, country, source, units, value])
Example #6
    def create_simple_transaction_table(self, year, filename, factor=1):
        print("creating transations table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename, ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print("%d rows inserted" % insert_count)
Example #7
    def create_simple_transaction_table(self, year, filename, factor=1):
        print("creating transations table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename,
                          ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print ("%d rows inserted" % insert_count)
Example #8
def parse_codes():
    path = fileutils.getdatapath("sector_map.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    sheets = wb.sheets()
    for sheet in sheets:
        year = int(sheet.name)
        tablename = "%s.sector_map_%d" % (config.SCHEMA, year)

        if year > 2006:  # CxI only, need map between commods and inds
            colnames = [
                "io_sector", "env_sector", "harmonized_env", "io_commod",
                "io_ind"
            ]
            coltypes = ["varchar(255)"] * 5
        else:
            colnames = ["io_sector", "env_sector", "harmonized_env"]
            coltypes = ["varchar(255)"] * 3

        table = SQLTable(tablename, colnames, coltypes)
        table.create()
        table.truncate()
        for i in range(sheet.nrows):
            row = sheet.row_values(i)
            io_sector = row[0].strip()
            env_sector = row[1].strip()
            harmonized_env = row[2]
            if type(harmonized_env) is float:
                harmonized_env = str(int(harmonized_env))
            harmonized_env = harmonized_env.strip()
            if year > 2006:
                io_commod = row[4].strip()
                io_ind = row[5].strip()
                table.insert(
                    [io_sector, env_sector, harmonized_env, io_commod, io_ind])
            else:
                table.insert([io_sector, env_sector, harmonized_env])
Example #9
class IOTableStateTracker(TableStateTracker):

    def __init__(self):
        TableStateTracker.__init__(self)
        
        self.make_table = None
        self.use_table = None

        self.make_insert_count = 0
        self.use_insert_count = 0

    def flush(self):
        TableStateTracker.flush(self)

        if self.make_insert_count:
            print("%d rows inserted to make table"
                  % self.make_insert_count)
            self.make_insert_count = 0
        if self.use_insert_count:
            print("%d rows inserted to use table"
                  % self.use_insert_count)
            self.use_insert_count = 0

    def create_make_table(self, year):
        print("creating make table for %s..." % year)

        tablename = "%s.make_%s" % (config.IO_SCHEMA, year)
        self.make_table = SQLTable(tablename,
                          ["industry", "commodity", "thousands"],
                          ["varchar(6)", "varchar(6)", "bigint"])
        self.make_table.create()
        self.make_table.truncate()

    def create_use_table(self, year, has_margins=False):
        print("creating use table for %s..." % year)

        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")

        tablename = "%s.use_%s" % (config.IO_SCHEMA, year)
        self.use_table = SQLTable(tablename, cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()

    def insert_make(self, indus, commod, makeval, factor=1):
        value = float(makeval) * factor
        if (value != 0):
            self.make_table.insert([indus.strip(), commod.strip(), int(value)])
            self.make_insert_count += 1

    def insert_use(self, commod, indus, useval,
                   margins={}, factor=1):

        useval = float(useval) * factor
        nonzero = useval
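        # nonzero accumulates the base use value plus any margin values so
        # that rows which are zero in every column can be skipped below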

        values = [commod.strip(), indus.strip(), int(useval)]
        if len(margins) > 0:
            for margin_field in bea.use_table_margins:
                value = 0
                if margin_field in margins:
                    value = float(margins[margin_field]) * factor
                    if value:
                        nonzero += value
                values.append(value)

        if nonzero != 0:
            self.use_table.insert(values)
            self.use_insert_count += 1

    # this is for years with no distinction between
    # make and use tables
    def create_simple_transaction_table(self, year, filename, factor=1):
        print("creating transations table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename,
                          ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print ("%d rows inserted" % insert_count)

    # this is for years that have make and use but no margins
    def create_simple_make_use(self, year, filename, factor=1):
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) == 4:
                    input_ind = cols[0]    # comm consumed (producing ind)
                    output_ind = cols[1]   # consuming ind (comm produced)
                    use_dollars = cols[2]  # use in producers' prices
                    make_dollars = cols[3] # make in producers' prices

                    self.insert_make(input_ind, output_ind,
                                     make_dollars, factor)
                    self.insert_use(commod=input_ind, indus=output_ind,
                                    useval=use_dollars, factor=factor)
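
A usage sketch for the tracker above, loading one year of the combined make/use text format (the year, cache filename, and scale factor are assumptions):

tracker = IOTableStateTracker()
tracker.create_simple_make_use(1967, "io-annual/1967.txt", factor=1)
tracker.flush()  # prints the make/use insert counts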
Example #10
def parse_io():
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
        }

    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)
    
        # millions are in NTD
        table = SQLTable(tablename,
                         ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()
    
        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        to_codes = sheet.row_values(0)
        to_names = sheet.row_values(1)
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_code = row[0].strip()
            from_name = row[1].strip()
            for i in range(2, len(to_names)):
                to_name = to_names[i].strip()
                value = row[i]
                table.insert([from_name, to_name, value])

        if year == 2010:
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist": sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                    sqlhelper.set_repr(config.from_blacklists[year]),
                }

            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
                SELECT from_map.io_sector AS from_sector,
                       to_map.io_sector as to_sector,
                       sum(millions) as millions
                  FROM %(tablename)s io,
                       (SELECT DISTINCT io_sector, io_commod
                          FROM %(maptable)s) from_map,
                       (SELECT DISTINCT io_sector, io_ind
                          FROM %(maptable)s) to_map
                 WHERE io.to_sector NOT IN %(to_blacklist)s
                   AND io.from_sector NOT IN %(from_blacklist)s
                   AND from_map.io_commod = io.from_sector
                   AND to_map.io_ind = io.to_sector
                 GROUP BY from_map.io_sector, to_map.io_sector""" % strings

            print(sql)
            db.execute(sql)
Example #11
class IOTableStateTracker(TableStateTracker):
    def __init__(self):
        TableStateTracker.__init__(self)

        self.make_table = None
        self.use_table = None

        self.make_insert_count = 0
        self.use_insert_count = 0

    def flush(self):
        TableStateTracker.flush(self)

        if self.make_insert_count:
            print("%d rows inserted to make table" % self.make_insert_count)
            self.make_insert_count = 0
        if self.use_insert_count:
            print("%d rows inserted to use table" % self.use_insert_count)
            self.use_insert_count = 0

    def create_make_table(self, year):
        print("creating make table for %s..." % year)

        tablename = "%s.make_%s" % (config.IO_SCHEMA, year)
        self.make_table = SQLTable(tablename,
                                   ["industry", "commodity", "thousands"],
                                   ["varchar(6)", "varchar(6)", "bigint"])
        self.make_table.create()
        self.make_table.truncate()

    def create_use_table(self, year, has_margins=False):
        print("creating use table for %s..." % year)

        cols = ["commodity", "industry", "thousands"]
        coltypes = ["varchar(6)", "varchar(6)", "bigint"]
        if has_margins:
            for field in bea.use_table_margins:
                cols.append(field)
                coltypes.append("int")

        tablename = "%s.use_%s" % (config.IO_SCHEMA, year)
        self.use_table = SQLTable(tablename, cols, coltypes)
        self.use_table.create()
        self.use_table.truncate()

    def insert_make(self, indus, commod, makeval, factor=1):
        value = float(makeval) * factor
        if (value != 0):
            self.make_table.insert([indus.strip(), commod.strip(), int(value)])
            self.make_insert_count += 1

    def insert_use(self, commod, indus, useval, margins={}, factor=1):

        useval = float(useval) * factor
        nonzero = useval

        values = [commod.strip(), indus.strip(), int(useval)]
        if len(margins) > 0:
            for margin_field in bea.use_table_margins:
                value = 0
                if margin_field in margins:
                    value = float(margins[margin_field]) * factor
                    if value:
                        nonzero += value
                values.append(value)

        if nonzero != 0:
            self.use_table.insert(values)
            self.use_insert_count += 1

    # this is for years with no distinction between
    # make and use tables
    def create_simple_transaction_table(self, year, filename, factor=1):
        print("creating transations table for %s..." % year)

        tablename = "%s.transactions_%s" % (config.IO_SCHEMA, year)
        xtable = SQLTable(tablename, ["producer", "consumer", "thousands"],
                          ["varchar(6)", "varchar(6)", "int"])
        xtable.create()
        xtable.truncate()

        insert_count = 0
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) >= 3:
                    value = float(cols[2]) * factor
                    if (value != 0):
                        xtable.insert([cols[0], cols[1], int(value)])
                        insert_count += 1

        print("%d rows inserted" % insert_count)

    # this is for years that have make and use but no margins
    def create_simple_make_use(self, year, filename, factor=1):
        self.create_make_table(year)
        self.create_use_table(year, has_margins=False)
        with open(fileutils.getcache(filename), "r") as f:
            for line in f:
                cols = line.split()
                if len(cols) == 4:
                    input_ind = cols[0]  # comm consumed (producing ind)
                    output_ind = cols[1]  # consuming ind (comm produced)
                    use_dollars = cols[2]  # use in producers' prices
                    make_dollars = cols[3]  # make in producers' prices

                    self.insert_make(input_ind, output_ind, make_dollars,
                                     factor)
                    self.insert_use(commod=input_ind,
                                    indus=output_ind,
                                    useval=use_dollars,
                                    factor=factor)
Example #12
File: bts.py Project: sonya/eea
def doparse():
    carrier_countries = {
        #"-": "", # Unknown
        "1I": "USA", # Sky Trek International Airlines
        "2T": "CAN", # Canada 3000 Airlines Ltd.
        "3Z": "USA", # Tatonduk Outfitters Limited d/b/a Everts Air Alaska and Everts Air Cargo
        "5X": "USA", # United Parcel Service
        "5Y": "USA", # Atlas Air Inc.
        "6F": "GBR", # Laker Airways Inc.
        #"6U": "", # Air Ukraine
        #"6Y": "", # Nicaraguense De Aviacion Sa
        #"7P": "", # Apa International Air S.A. (dominican rep)
        #"7Z": "", # Lb Limited
        "8C": "USA", # Air Transport International
        "AA": "USA", # American Airlines Inc.
        "AC": "CAN", # Air Canada
        #"ADB": "", # Antonov Company (ukraine)
        "AF": "FRA", # Compagnie Nat'l Air France
        "AI": "IND", # National Aviation Company of India Limited d/b/a Air India
        "AM": "MEX", # Aeromexico
        #"AQQ": "", # Air Charter (Safa)
        #"AR": "", # Aerolineas Argentinas
        "AS": "USA", # Alaska Airlines Inc.
        #"AT": "", # Royal Air Maroc (morocco)
        #"AV": "", # Aerovias Nac'l De Colombia
        "AY": "FIN", # Finnair Oy
        "AZ": "ITA", # Compagnia Aerea Italiana
        #"All Rows": "", # All Rows (including those not displayed)
        "BA": "GBR", # British Airways Plc
        #"BBQ": "", # Balair Ag (swiss)
        "BCQ": "CAN", # Bradley Air Services Ltd.
        #"BG": "", # Biman Bangladesh Airlines
        "BQ": "MEX", # Aeromar C. Por A.
        "BR": "TWN", # Eva Airways Corporation
        #"BW": "", # Caribbean Airlines Limited (trinidad and tobago)
        "BY": "GBR", # Britannia Airways Ltd.
        "CA": "CHN", # Air China
        #"CC": "", # Air Atlanta Icelandic
        "CDQ": "USA", # Kitty Hawk International
        #"CF": "", # Compan. De Aviacion Faucett (peru)
        "CI": "TWN", # China Airlines Ltd.
        #"CLQ": "", # Aero Transcolombiana
        #"CM": "", # Compania Panamena (Copa)
        "CO": "USA", # Continental Air Lines Inc.
        "CP (1)": "CAN", # Canadian Airlines International Ltd.
        "CS": "USA", # Continental Micronesia
        "CV": "LUX", # Cargolux Airlines International S.A
        #"CVQ": "", # Caraven S.A.
        #"CX": "", # Cathay Pacific Airways Ltd. (hong kong, includes pre 1997)
        "CYQ": "FRA", # Corse Air International (assuming corsair)
        "CZ": "CHN", # China Southern Airlines
        "DE": "DEU", # Condor Flugdienst
        "DHQ": "GBR", # DHL Aero Expresso
        "DL": "USA", # Delta Air Lines Inc.
        #"ED": "", # Andes (ecuador or argentina)
        "EH": "ESP", # Saeta Airlines
        "EI": "IRL", # Aer Lingus Plc
        #"EOQ": "", # Aeroservicios Ecuatorianos
        "ER": "USA", # Astar USA, LLC
        #"EU": "", # Ecuatoriana De Aviacion
        #"EXQ": "", # Export Air Del Peru S.A.
        "EZ": "TWN", # Evergreen International Inc.
        "F9": "USA", # Frontier Airlines Inc.
        "FCQ": "USA", # Falcon Air Express
        #"FF": "", # Tower Air Inc.
        #"FI": "", # Icelandair
        #"FJ": "", # Air Pacific Ltd. (fiji)
        "FNQ": "USA", # Fine Airlines Inc.
        #"FQ": "", # Air Aruba
        #"FS": "", # Serv De Trans Aereos Fuegui (argentina)
        "FX": "USA", # Federal Express Corporation
        #"G3": "", # Aerochago S.A.
        "GA": "IDN", # P.T. Garuda Indonesian Arwy
        "GD": "MEX", # Transp. Aereos Ejecutivos
        #"GF": "", # Gulf Air Company (bahrain)
        #"GH": "", # Ghana Airways Corporation
        "GJ (1)": "MEX", # Mexicargo
        "GL": "USA", # Miami Air International
        "GR": "USA", # Gemini Air Cargo Airways
        #"GU": "", # Aviateca (guatemala)
        #"GY": "", # Guyana Airways Corporation
        "H2": "BEL", # City Bird
        "H5": "RUS", # Magadan Airlines
        "HA": "USA", # Hawaiian Airlines Inc.
        "HAQ": "DEU", # Hapag Lloyd Flug.
        "HCQ": "USA", # Av Atlantic
        #"HFQ": "", # Haiti Air Freight Intl
        "HLQ": "AUS", # Heavylift Cargo Airlines Lt
        "HP": "USA", # America West Airlines Inc. (Merged with US Airways 9/05. Stopped reporting 10/07.)
        #"HY": "", # Uzbekistan Airways
        "IB": "ESP", # Iberia Air Lines Of Spain
        #"ITQ": "", # Interamericana De Aviacion (uruguay)
        "IW": "FRA", # Air Liberte Aka Aom Minerve
        #"JAQ": "", # Jamaica Air Freighters
        "JD": "JPN", # Japan Air System Co. Ltd.
        "JI (1)": "USA", # Midway Airlines Inc.
        "JK": "ESP", # Spanair S.A.
        "JKQ": "USA", # Express One International Inc.
        "JL": "JPN", # Japan Air Lines Co. Ltd.
        #"JM": "", # Air Jamaica Limited
        "JR": "USA", # Aero California
        "JW": "CAN", # Arrow Air Inc.
        "JZ": "JPN", # Japan Air Charter Co. Ltd.
        "K8 (1)": "NLD", # Dutch Caribbean Airlines
        "KE": "KOR", # Korean Air Lines Co. Ltd.
        "KH": "USA", # Aloha Air Cargo
        #"KI": "", # Time Air Ltd. (south africa)
        "KL": "NLD", # Klm Royal Dutch Airlines
        #"KP": "", # Kiwi International
        "KR": "USA", # Kitty Hawk Aircargo
        "KTQ": "TUR", # Turks Air Ltd.
        #"KU": "", # Kuwait Airways Corp.
        "KW": "USA", # Carnival Air Lines Inc.
        #"KX": "", # Cayman Airways Limited
        "KZ": "JPN", # Nippon Cargo Airlines
        #"LA": "", # Lan-Chile Airlines
        #"LB": "", # Lloyd Aereo Boliviano S. A.
        "LGQ": "MEX", # Lineas Aereas Allegro
        "LH": "DEU", # Lufthansa German Airlines
        "LO": "POL", # Polskie Linie Lotnicze
        #"LR": "", # Lacsa (costa rica)
        #"LSQ": "", # Lineas Aereas Suramerican (colombia)
        "LT": "DEU", # Luftransport-Unternehmen
        #"LU": "", # Air Atlantic Dominicana
        #"LY": "", # El Al Israel Airlines Ltd.
        "LZ": "BGR", # Balkan Bulgarian Airlines
        "M6": "USA", # Amerijet International
        "M7": "MEX", # Aerotransportes Mas De Crga
        "MA": "HUN", # Malev Hungarian Airlines
        "MG": "USA", # Champion Air
        #"MH": "", # Malaysian Airline System
        #"ML": "", # Aero Costa Rica
        "MP": "NLD", # Martinair Holland N.V.
        #"MS": "", # Egyptair
        "MT": "GBR", # Thomas Cook Airlines Uk Ltd.
        "MT (1)": "GBR", # Flying Colours Airlines Ltd.
        "MU": "CHN", # China Eastern Airlines
        #"MUQ": "", # Aerolineas Mundo (columbia)
        "MX": "MEX", # Compania Mexicana De Aviaci
        #"MYQ": "", # Lineas Aereas Mayas (Lamsa)
        #"N5 (1)": "", # Nations Air Express Inc.
        "NA": "USA", # North American Airlines
        "NG": "DEU", # Lauda Air Luftfahrt Ag
        "NH": "JPN", # All Nippon Airways Co.
        "NK": "USA", # Spirit Air Lines
        "NW": "USA", # Northwest Airlines Inc.
        "NWQ": "USA", # N. W. Territorial Airways
        #"NZ": "", # Air New Zealand
        "OA": "GRC", # Olympic Airways
        #"OI": "", # Prestige Airways (uae)
        "OK": "CZE", # Czech Airlines
        #"ON": "", # Air Nauru
        "OS": "AUT", # Austrian Airlines
        "OW": "USA", # Executive Airlines
        "OZ": "KOR", # Asiana Airlines Inc.
        "PA (2)": "USA", # Pan American World Airways
        "PCQ": "USA", # Pace Airlines
        #"PIQ": "", # Pacific International Airlines (ambiguous: usa, panama)
        #"PK": "", # Pakistan International Airlines
        #"PL": "", # Aero Peru
        "PNQ": "USA", # Panagra Airways
        "PO": "USA", # Polar Air Cargo Airways
        #"PR": "", # Philippine Airlines Inc.
        "PRQ": "USA", # Florida West Airlines Inc.
        "PT": "USA", # Capital Cargo International
        #"PY": "", # Surinam Airways Limited
        "Q7": "BEL", # Sobelair
        "QF": "AUS", # Qantas Airways Ltd.
        "QK": "CAN", # Jazz Aviation LP
        #"QN": "", # Royal Air (ambiguous)
        "QO": "MEX", # Aeromexpress
        "QQ": "USA", # Reno Air Inc.
        #"QT": "", # Transportes Aereos Mercantiles Panamericanos S.A (colombia)
        "QTQ": "IRL", # Aer Turas Teoranta
        "QX": "USA", # Horizon Air
        "RD": "USA", # Ryan International Airlines
        "REQ": "USA", # Renown Aviation
        "RG": "BRA", # Varig S. A.
        #"RJ": "", # Alia-(The) Royal Jordanian
        #"RK": "", # Air Afrique
        "RNQ": "GBR", # Mytravel Airways
        "RO": "ROU", # Tarom Romanian Air Transpor
        #"SA": "", # South African Airways
        "SAQ": "USA", # Southern Air Transport Inc.
        "SEQ": "GBR", # Sky Service F.B.O.
        "SIQ": "LUX", # Premiair
        "SK": "SWE", # Scandinavian Airlines Sys.
        "SM": "USA", # Sunworld International Airlines
        "SN (1)": "BEL", # Sabena Belgian World Air.
        "SPQ": "USA", # Sun Pacific International
        #"SQ": "", # Singapore Airlines Ltd.
        #"SR": "", # Swissair Transport Co. Ltd.
        "SU": "RUS", # Aeroflot Russian Airlines
        #"SV": "", # Saudi Arabian Airlines Corp
        "SX (1)": "MEX", # Aeroejecutivo S.A.
        "SY": "USA", # Sun Country Airlines d/b/a MN Airlines
        "T9": "USA", # TransMeridian Airlines
        #"TA": "", # Taca International Airlines (el savador)
        "TCQ": "USA", # Express.Net Airlines
        #"TG": "", # Thai Airways International Ltd.
        "TK": "TUR", # Turk Hava Yollari A.O.
        "TKQ": "USA", # Trans-Air-Link Corporation
        "TNQ": "USA", # Emery Worldwide Airlines
        "TP": "PRT", # Tap-Portuguese Airlines
        "TR": "BRA", # Transbrasil S.A.
        "TRQ": "SWE", # Blue Scandinavia Ab
        "TS": "CAN", # Air Transat
        "TW": "USA", # Trans World Airways LLC
        #"TZ": "", # ATA Airlines d/b/a ATA (iran)
        "TZQ": "GBR", # First Choice Airways
        "U7": "USA", # USA Jet Airlines Inc.
        "UA": "USA", # United Air Lines Inc.
        #"UD": "", # Fast Air Carrier Ltd.
        "UN": "RUS", # Transaero Airlines
        #"UP": "", # Bahamasair Holding Limited
        "US": "USA", # US Airways Inc. (Merged with America West 9/05. Reporting for both starting 10/07.)
        "UX": "ESP", # Air Europa
        #"UYQ": "", # Aerolineas Uruguayas S.A.
        #"VA (1)": "", # Venezuelan International Airways
        #"VC": "", # Servicios Avensa (venezuela)
        #"VE": "", # Aerovias Venezolanas-Avensa
        "VIQ": "RUS", # Volga-Dnepr Airlines
        "VP": "BRA", # Viacao Aerea Sao Paulo
        #"VR": "", # Transportes Aereos De Cabo (cape verde)
        "VS": "GBR", # Virgin Atlantic Airways
        #"VX (1)": "", # Aces Airlines (colombia)
        #"W7": "", # Western Pacific Airlines (solomon islands)
        #"WD": "", # Halisa Air (haiti)
        "WE": "USA", # Centurion Cargo Inc.
        "WO": "USA", # World Airways Inc.
        #"XC": "", # Air Caribbean (1)
        "XE": "USA", # ExpressJet Airlines Inc. (1)
        "XJ": "USA", # Mesaba Airlines
        "XP": "USA", # Casino Express
        "YX (1)": "USA", # Midwest Airline, Inc.
        "ZB": "USA", # Monarch Airlines
        #"ZUQ": "", # Zuliana De Aviacion (venezuela)
        "ZX (1)": "CAN", # Airbc Ltd.
        }

    tablename = "air_carriers"
    table = SQLTable(
        tablename,
        ["year", "carrier", "series", "value"],
        ["int", "varchar(15)", "varchar(15)", "int"])
    table.create()
    table.truncate()

    carriers = {}

    for year in config.STUDY_YEARS:
        for filestem in ["freight", "passengers"]:
            filename = filestem + str(year) + ".csv"
            path = fileutils.getcache(filename, "bts")
            with open(path) as fh:
                csvf = csv.reader(fh)
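                # skip the first row; the second row is treated as the header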
                next(csvf)
                header = next(csvf)
                for row in csvf:
                    if len(row) == 3:
                        carrier = row[0]
                        #carrier_name = row[1]
                        if carrier in carrier_countries:
                            country = carrier_countries[carrier]
                            value = int(row[2])
                            table.insert([year, country, filestem, value])
Example #13
def parse_io():
    io_files = {
        1996: "410281134571.xls",
        1999: "4102715414971.xls",
        2001: "4122111363671.xls",
        2004: "611239581071.xls",
        2006: "9121414285971.xls",
        2007: "1139203871.xls",
        2008: "1139204871.xls",
        2009: "11229101502.xls",
        2010: "1122910141371.xls",
    }

    for (year, filename) in io_files.items():
        tablename = "%s.io_%d" % (config.SCHEMA, year)

        # millions are in NTD
        table = SQLTable(tablename, ["from_sector", "to_sector", "millions"],
                         ["varchar(255)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        path = fileutils.getcache(filename, "tw/%d" % year)
        wb = xlrd.open_workbook(path)
        sheet = wb.sheets()[0]
        to_codes = sheet.row_values(0)
        to_names = sheet.row_values(1)
        for rowindex in range(2, sheet.nrows):
            row = sheet.row_values(rowindex)
            from_code = row[0].strip()
            from_name = row[1].strip()
            for i in range(2, len(to_names)):
                to_name = to_names[i].strip()
                value = row[i]
                table.insert([from_name, to_name, value])

        if year == 2010:
            strings = {
                "viewname": "%s.io_view_%d" % (config.SCHEMA, year),
                "tablename": tablename,
                "maptable": "%s.sector_map_%d" % (config.SCHEMA, year),
                "to_blacklist": sqlhelper.set_repr(config.to_blacklists[year]),
                "from_blacklist":
                sqlhelper.set_repr(config.from_blacklists[year]),
            }

            sql = """CREATE OR REPLACE VIEW %(viewname)s AS
                SELECT from_map.io_sector AS from_sector,
                       to_map.io_sector as to_sector,
                       sum(millions) as millions
                  FROM %(tablename)s io,
                       (SELECT DISTINCT io_sector, io_commod
                          FROM %(maptable)s) from_map,
                       (SELECT DISTINCT io_sector, io_ind
                          FROM %(maptable)s) to_map
                 WHERE io.to_sector NOT IN %(to_blacklist)s
                   AND io.from_sector NOT IN %(from_blacklist)s
                   AND from_map.io_commod = io.from_sector
                   AND to_map.io_ind = io.to_sector
                 GROUP BY from_map.io_sector, to_map.io_sector""" % strings

            print(sql)
            db.execute(sql)
Example #14
def parse_env():
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001

    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename":
            "IO_waste.xls",
            "columns": [
                "Total waste", "General waste", "Hazardous waste",
                "Total waste - improper disposal",
                "General waste - improper disposal",
                "Hazardous waste - improper disposal"
            ],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }

    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename, ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]

        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:  # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                        first_file = False
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i + 5]])

    # parse chinese env tables
    # this is a file that we created by compiling older chinese data and
    # manually copying info from the latest (2010) pdf files

    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }

    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename, ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)

            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    for i in range(1, len(header)):
                        measurement = header[i].strip()
                        value = row[i]
                        table.insert([sector, measurement, value])

                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True
Example #15
class SectorCodes:

    def __init__(self, codetablename, mode="r"):
        self.mode = mode

        self.codetable = SQLTable(
            codetablename,
            ["code", "description"],
            ["varchar(15)", "varchar(255)"])

        self.code_dict = {}
        self.reverse_code_dict = {}

        self.setup()

    def setup(self):
        if self.mode == "w":
            # invalid codes or codes that we don't want to record
            self.code_blacklist = []

            # if we want to override the code provided with something
            # we make up (or from another set) based on the description
            self.manual_codes = {}

            self.codetable.create()

        # get existing codes from db
        for (code, desc) in self.codetable.getall():
            self.code_dict[code] = desc
            self.reverse_code_dict[desc] = code

        return self

    # for write mode
    def blacklist_code(self, code):
        self.code_blacklist.append(code)

        if code in self.code_dict:
            del self.code_dict[code]

    def set_blacklist(self, code_blacklist):
        self.code_blacklist = []
        for code in code_blacklist:
            self.blacklist_code(code)

    def curate_code_from_desc(self, desc, code):
        self.manual_codes[desc] = code

        self.code_dict[code] = desc
        self.reverse_code_dict[desc] = code

    def add_curated_codes(self, curated_codes):
        for (desc, code) in curated_codes.items():
            self.curate_code_from_desc(desc, code)

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))

        if type(desc) is str:
            desc = desc.strip()

        if desc in self.manual_codes:
            code = self.manual_codes[desc]

        if code is None or not len(code):
            # ignore empty args: without a usable code there is nothing
            # to record, whether or not a description was supplied
            return False
        elif code in self.code_blacklist:
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            # this is to check for blatant differences
            print(self.code_dict[code], "=>", desc)
        self.code_dict[code] = desc

        # there may be more than one description for the same code
        self.reverse_code_dict[desc] = code

        return code

    def has_code(self, code):
        return code in self.code_dict

    def get_code_for_title(self, desc):
        if desc in self.reverse_code_dict:
            return self.reverse_code_dict[desc]

    def get_title_for_code(self, code):
        if self.has_code(code):
            return self.code_dict[code]
        return False

    def update_codes(self):
        if self.mode != "w":
            raise Exception("SectorCodes created in read-only mode")

        self.codetable.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.codetable.insert([code, desc])
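
A write-mode sketch for SectorCodes (the code table name, blacklist entry, and codes are made up for illustration):

codes = SectorCodes("wiod.industry_codes", mode="w")
codes.set_blacklist(["n/a"])
codes.add_curated_codes({"Private households": "HH"})
codes.set_code("c15", "Food, beverages and tobacco")
codes.update_codes()  # persist code_dict back to the code table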
Example #16
    def create_table(self, tablename, cols, coltypes, cascade=False):
        table = SQLTable(tablename, cols, coltypes)
        table.drop(cascade)
        table.create()
        self.tables[tablename] = table
Example #17
def parse_env():
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]

            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)

            print(adir, filename)
            title = soup.title.string

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise

            prev_rowdata = None
            prev_rowspans = None
            data = []

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this makes it necessary to track state across rows heuristically.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)

                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
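                    # a data row (or nothing buffered yet): flush any
                    # previously buffered row and buffer the current one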
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "

                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    columns = insert_now
                    insert_now = None

                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and len(insert_now) == len(
                        columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()
Example #18
    def create_table(self, tablename, cols, coltypes, cascade=False):
        table = SQLTable(tablename, cols, coltypes)
        table.drop(cascade)
        table.create()
        self.tables[tablename] = table
Example #19
def parse_env():
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue
    
        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()
    
        def insert_row(rowdata, columns, max_sector_column):
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
    
            for (pollutant, amount) in zip(columns[max_sector_column+1:],
                                           rowdata[max_sector_column+1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])
    
        xact = db.xact(mode="READ WRITE")
        xact.begin()
    
        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb") # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
    
            print(adir, filename)
            title = soup.title.string
    
            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break
    
            columns = None
            did_have_numbers = False # true after we've parsed through
            max_sector_column = 0 # 1 if english separate, 0 otherwise
    
            prev_rowdata = None
            prev_rowspans = None
            data = []
    
            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this makes it necessary to track state across rows heuristically.
            insert_later = None
            insert_now = None
    
            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue
    
                rowspans = []
                rowdata = []
    
                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None
    
                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
    
                for cellpos in range(rowlen):
                    cell = cells[cellpos]
    
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])
    
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')
    
                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]
    
                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                    abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)
    
                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)
    
                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan: # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)
    
                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()
    
                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans
    
                if len(rowdata) == 0:
                    continue
    
                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue
    
                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0: # all blank strings
                    continue
    
                # if we're sure we have columns, reset the rowspans so
                # the multi-row rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)
    
                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break
    
                if has_numbers or insert_later is None:
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides, because
                        # overflow in rowdata[0] should only happen in a header
                        # row and overflow in rowdata[-1] only in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "
    
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
    
                    # if we trimmed blank cells off the previous row but the
                    # current row shows it was actually longer, extend it
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])
    
                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    columns = insert_now
                    insert_now = None
    
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
    
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
    
                elif insert_now is not None and len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)
    
            # flush the last buffered row once the loop ends
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)
    
            print(columns)
    
        xact.commit()
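
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): the rowspan handling
# is the subtle bit -- a cell that spans several rows only appears in the
# first row of its span, so later rows need the carried value re-inserted at
# that column.  The toy table and the "html.parser" backend below are
# assumptions chosen for illustration; the real scraper also has to cope
# with carried cells at the very end of a row.
# ---------------------------------------------------------------------------
from bs4 import BeautifulSoup

toy_html = """
<table>
  <tr><td rowspan="2">Industry A</td><td>SO2</td><td>10</td></tr>
  <tr><td>NOx</td><td>12</td></tr>
  <tr><td>Industry B</td><td>SO2</td><td>7</td></tr>
</table>
"""

def parse_with_rowspans(html):
    soup = BeautifulSoup(html, "html.parser")
    carry = {}   # column index -> [value, rows still to fill]
    rows = []
    for tr in soup.find_all("tr"):
        cells = iter(tr.find_all(["td", "th"]))
        row, col = [], 0
        while True:
            if col in carry:             # value carried down from an earlier row
                value, remaining = carry[col]
                row.append(value)
                if remaining > 1:
                    carry[col] = [value, remaining - 1]
                else:
                    del carry[col]
                col += 1
                continue
            cell = next(cells, None)
            if cell is None:
                break
            text = cell.get_text(strip=True)
            span = int(cell.get("rowspan", 1))
            if span > 1:                 # remember the value for later rows
                carry[col] = [text, span - 1]
            row.append(text)
            col += 1
        rows.append(row)
    return rows

# expected: [['Industry A', 'SO2', '10'], ['Industry A', 'NOx', '12'],
#            ['Industry B', 'SO2', '7']]
print(parse_with_rowspans(toy_html))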
Beispiel #20
0
def doparse():

    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename, ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    country_fips = {
        "LU": "LUX",
        "US": "USA",
        "NL": "NLD",
        "AU": "AUT",
        "SW": "SWE",
        "CA": "CAN",
        "AS": "AUS",
        "EI": "IRL",
        "GM": "DEU",
        "BE": "BEL",
        "TW": "TWN",
        "DA": "DNK",
        "UK": "GBR",
        "FR": "FRA",
        "JA": "JPN",
        "KS": "KOR",
        "SP": "ESP",
        "CY": "CYP",
        "SI": "SVN",
        "EZ": "CZE",
        "GR": "GRC",
        "MT": "MLT",
        "PO": "PRT",
        "LO": "SVK",
        "PL": "POL",
        "EN": "EST",
        "HU": "HUN",
        "LH": "LTU",
        "LG": "LVA",
        "MX": "MEX",
        "TU": "TUR",
        "BR": "BRA",
        "RO": "ROU",
        "BU": "BGR",
        "CH": "CHN",
        "ID": "IDN",
        "IN": "IND",
        "RS": "RUS",
        "FI": "FIN",
        "IT": "ITA",
    }

    # this file spec is documented in the xlsx file from the archive
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
    }

    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    imf_fields = (
        "LP",  # population
        "PPPPC",  # ppp per capita
        "NGDPRPC",  # gdp per capita in constant prices
        "NGDP_D",  # gdp deflator
    )

    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")

    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}

        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")

        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i

        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value):  #and year < last_year:
                        table.insert([
                            year, country, field,
                            float(value.replace(",", ""))
                        ])
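
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): the WEO download is a
# tab-delimited text file despite its .xls name, and numeric cells use
# thousands separators, so values are validated with a regex and cleaned
# before conversion.  The sample values below are made up for illustration.
# ---------------------------------------------------------------------------
import re

valid_float = re.compile(r"-*[\d\.,]+")

def weo_value(raw):
    """Return the cell as a float, or None for non-numeric cells like 'n/a'."""
    if raw and valid_float.match(raw):
        return float(raw.replace(",", ""))
    return None

assert weo_value("1,234.5") == 1234.5
assert weo_value("n/a") is None
assert weo_value("") is None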
Beispiel #21
0
class SectorCodes:
    def __init__(self, codetablename, mode="r"):
        self.mode = mode

        self.codetable = SQLTable(codetablename, ["code", "description"],
                                  ["varchar(15)", "varchar(255)"])

        self.code_dict = {}
        self.reverse_code_dict = {}

        self.setup()

    def setup(self):
        if self.mode == "w":
            # invalid codes or codes that we don't want to record
            self.code_blacklist = []

            # if we want to override the code provided with something
            # we make up (or from another set) based on the description
            self.manual_codes = {}

            self.codetable.create()

        # get existing codes from db
        for (code, desc) in self.codetable.getall():
            self.code_dict[code] = desc
            self.reverse_code_dict[desc] = code

        return self

    # for write mode
    def blacklist_code(self, code):
        self.code_blacklist.append(code)

        if code in self.code_dict:
            del self.code_dict[code]

    def set_blacklist(self, code_blacklist):
        self.code_blacklist = []
        for code in code_blacklist:
            self.blacklist_code(code)

    def curate_code_from_desc(self, desc, code):
        self.manual_codes[desc] = code

        self.code_dict[code] = desc
        self.reverse_code_dict[desc] = code

    def add_curated_codes(self, curated_codes):
        for (desc, code) in curated_codes.items():
            self.curate_code_from_desc(desc, code)

    # returns the code used if it was recognized, false otherwise
    def set_code(self, code, desc):
        if type(code) is str:
            code = code.strip()
        elif type(code) is float:
            code = str(int(code))

        if type(desc) is str:
            desc = desc.strip()

        if desc in self.manual_codes:
            code = self.manual_codes[desc]

        if code is None or not len(code):
            # ignore empty codes, whether or not a description is present
            return False
        elif code in self.code_blacklist:
            return False

        if code in self.code_dict and self.code_dict[code] != desc:
            # this is to check for blatant differences
            print(self.code_dict[code], "=>", desc)
        self.code_dict[code] = desc

        # there may be more than one description for the same code
        self.reverse_code_dict[desc] = code

        return code

    def has_code(self, code):
        return code in self.code_dict

    def get_code_for_title(self, desc):
        if desc in self.reverse_code_dict:
            return self.reverse_code_dict[desc]

    def get_title_for_code(self, code):
        if self.has_code(code):
            return self.code_dict[code]
        return False

    def update_codes(self):
        if self.mode != "w":
            raise Exception("SectorCodes created in read-only mode")

        self.codetable.truncate()
        for code in sorted(self.code_dict.keys()):
            desc = self.code_dict[code]
            self.codetable.insert([code, desc])
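
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the class above): how SectorCodes might be
# driven in write mode.  The table name, codes and descriptions are made up,
# and the constructor still needs the project's SQLTable / database setup
# used throughout this module.
# ---------------------------------------------------------------------------
codes = SectorCodes("wiod.example_sector_codes", mode="w")  # hypothetical table

codes.set_blacklist(["0000"])                        # placeholder code to drop
codes.add_curated_codes({"Private households": "P95"})

codes.set_code("A01", "Agriculture")                 # returns "A01"
codes.set_code("0000", "Not classified")             # blacklisted -> False
codes.set_code(None, "Private households")           # curated by desc -> "P95"

codes.update_codes()   # truncates the code table and rewrites all known codes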
Beispiel #22
0
def doparse():

    tablename = "%s.world_supplement" % config.WIOD_SCHEMA
    table = SQLTable(tablename,
                     ["year", "country", "measurement", "value"],
                     ["int", "char(3)", "varchar(8)", "float"])
    table.create()
    table.truncate()

    # census data has more complete population counts
    country_fips = {
        "LU": "LUX", "US": "USA", "NL": "NLD", "AU": "AUT", "SW": "SWE",
        "CA": "CAN", "AS": "AUS", "EI": "IRL", "GM": "DEU", "BE": "BEL",
        "TW": "TWN", "DA": "DNK", "UK": "GBR", "FR": "FRA", "JA": "JPN",
        "KS": "KOR", "SP": "ESP", "CY": "CYP", "SI": "SVN", "EZ": "CZE",
        "GR": "GRC", "MT": "MLT", "PO": "PRT", "LO": "SVK", "PL": "POL",
        "EN": "EST", "HU": "HUN", "LH": "LTU", "LG": "LVA", "MX": "MEX",
        "TU": "TUR", "BR": "BRA", "RO": "ROU", "BU": "BGR", "CH": "CHN",
        "ID": "IDN", "IN": "IND", "RS": "RUS", "FI": "FIN", "IT": "ITA",
        }
    
    # this file spec is documented in the xlsx file from the archive
    path = fileutils.getcache("IDBext001.txt", "wsupp")
    with open(path, "r") as fh:
        for line in fh:
            fields = line.split("|")
            if len(fields) == 3:
                fips = fields[0]
                if fips in country_fips:
                    year = int(fields[1])
                    country = country_fips[fips]
                    table.insert([year, country, "pop", int(fields[2])])

    # worldbank data has some deflator data that imf doesn't
    worldbank = {
        "ppp_pc": "NY.GDP.PCAP.PP.KD_Indicator_MetaData_en_EXCEL.xls",
        #"gdp_pc": "NY.GDP.PCAP.CD_Indicator_MetaData_en_EXCEL.xls",
        #"dec": "PA.NUS.ATLS_Indicator_MetaData_en_EXCEL.xls",
        #"pppratio": "PA.NUS.PPPC.RF_Indicator_MetaData_en_EXCEL.xls",
        "deflator": "NY.GDP.DEFL.ZS_Indicator_MetaData_en_EXCEL.xls",
        }
    
    for (indicator, filename) in worldbank.items():
        path = fileutils.getcache(filename, "wsupp")
        wb = xlrd.open_workbook(path)
        sheet = wb.sheet_by_index(0)
        header = [int(x) for x in sheet.row_values(0)[2:]]
        for i in range(1, sheet.nrows):
            row = sheet.row_values(i)
            if row[1] in config.countries:
                country = row[1]
                for (year, value) in zip(header, row[2:]):
                    if type(value) is float and value != 0:
                        table.insert([year, country, indicator, value])

    imf_fields = (
        "LP", # population
        "PPPPC", # ppp per capita
        "NGDPRPC", # gdp per capita in constant prices
        "NGDP_D", # gdp deflator
        )

    # this is actually a csv file despite what it's called
    path = fileutils.getcache("WEOApr2012all.xls", "wsupp")

    with codecs.open(path, "r", "cp1252") as fh:
        csvf = csv.reader(fh, dialect=csv.excel_tab)
        header = next(csvf)
        year_cols = {}

        valid_year = re.compile(r"\d{4}")
        valid_float = re.compile(r"-*[\d\.,]+")

        for i in range(len(header)):
            if header[i] == "ISO":
                country_col = i
            elif header[i] == "WEO Subject Code":
                subject_col = i
            elif valid_year.match(header[i]):
                year_cols[int(header[i])] = i
            elif header[i] == "Estimates Start After":
                last_year_col = i

        for row in csvf:
            if len(row) > subject_col and row[subject_col] in imf_fields:
                field = row[subject_col]
                country = row[country_col]
                if country not in config.countries:
                    continue
                if valid_year.match(row[last_year_col]):
                    last_year = int(row[last_year_col])
                else:
                    # not clear if this means all values are estimated
                    last_year = 9999
                for (year, colnum) in year_cols.items():
                    value = row[colnum]
                    if valid_float.match(value): #and year < last_year:
                        table.insert([year, country, field,
                                      float(value.replace(",", ""))])
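
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): the World Bank sheets
# put the years in the header row and one country per data row, so zipping
# the year header with the tail of each row pairs every year with its value,
# and blank cells are skipped.  The numbers below are made up.
# ---------------------------------------------------------------------------
header = [1995, 1996, 1997]
row = ["Germany", "DEU", 1.02, "", 1.07]

for (year, value) in zip(header, row[2:]):
    if type(value) is float and value != 0:
        print(year, value)   # prints 1995/1.02 and 1997/1.07; 1996 is skipped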
Beispiel #23
0
def parse_env():
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001
    
    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
            },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
            },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste",
                        "Hazardous waste", "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
            },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
            },
        }

    tables_by_year = {}
    for year in eng_env_years:
        if year not in tables_by_year:
            tablename = "%s.env_%d" % (config.SCHEMA, year)
            table = SQLTable(tablename,
                             ["sector", "series", "value"],
                             ["varchar(55)", "varchar(255)", "float"])
            table.create()
            table.truncate()
            tables_by_year[year] = table
        else:
            table = tables_by_year[year]
    
        first_file = True
        for (tkey, tdata) in eng_env_files.items():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file: # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                    for i in range(len(tdata["columns"])):
                        table.insert([sector, tdata["columns"][i], row[i+5]])
            # clear the flag only after the whole first file has been read,
            # so every sector gets the shared output/input/GDP columns once
            first_file = False
    
    # parse chinese env tables
    # this is a file that we created by compiling older chinese data and
    # manually copying info from the latest (2010) pdf files
    
    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
        }
    
    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()
        
        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)
    
            # the 2010 tables have several rows that we don't want
            should_parse = (year != 2010)
            for i in range(1, sheet.nrows):
                row = sheet.row_values(i)
                if should_parse:
                    sector = row[0].strip()
                    for j in range(1, len(header)):
                        measurement = header[j].strip()
                        value = row[j]
                        table.insert([sector, measurement, value])
    
                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True
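
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the example above): the 2010 sheets are
# read with a small state flag -- rows are ignored until a marker row
# ("依行業分" / "依部門分", i.e. "by industry" / "by sector") appears, after
# which every following row is treated as data.  The rows below are made up.
# ---------------------------------------------------------------------------
rows = [
    ["單位: 公噸", "", ""],        # unit header row, ignored
    ["依行業分", "", ""],          # marker row: data starts below
    ["Mining", "SOx", 1.5],
    ["Manufacturing", "SOx", 120.0],
]

should_parse = False
for row in rows:
    if should_parse:
        print(row)                 # only the two data rows are printed
    elif row[0] in ("依行業分", "依部門分"):
        should_parse = True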