Ejemplo n.º 1
0
Archivo: parser.py Proyecto: sonya/eea
def parse_io():
    """Load the 2005 and 2007 input-output workbooks into hybrid IO tables.

    For each year the first worksheet is scanned for the header row: the
    first row containing a text cell that equals "1" once surrounding
    zeros are stripped.  Column codes are read from that row starting at
    the fourth column, and every later row is written through
    ``tables.insert_io`` as (year, row code, column code, value).
    """
    sources = {
        2005: fileutils.getdatapath("2005年42部门投入产出流量表.xls", "cn-io"),
        2007: fileutils.getdatapath(
            "0101.xls", "cn-io", "中国投入产出表2007", "excel"),
    }

    # The spreadsheets merge cells for appearance rather than meaning, so
    # the final-use codes are curated by hand (there are only two years).
    curated_codes = (
        ("FU101", "农村居民消费"),
        ("FU102", "城镇居民消费"),
        ("FU103", "政府消费支出"),
        ("FU201", "固定资本形成总额"),
        ("FU202", "存货增加"),
        ("GCF", "资本形成合计"),
        ("EX", "出口"),
    )
    blacklisted_codes = ("TI", "TII")

    tables = HybridTableCreator(config.SCHEMA)

    for year, path in sources.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        workbook = xlrd.open_workbook(path)
        # in 2005 sheet 0 is x10k RMB, 2007 has only 1 sheet @x10k RMB
        sheet = workbook.sheet_by_index(0)

        for code, description in curated_codes:
            codes.set_code(code, description)
        for code in blacklisted_codes:
            codes.blacklist_code(code)

        ind_codes = None  # stays None until the header row has been seen
        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)

            if ind_codes is not None:
                # data row: register the row sector, then store its values
                from_code = codes.set_code(row[2], row[1])
                if not from_code:
                    continue
                for value, to_code in zip(row[3:], ind_codes):
                    if to_code is not None:
                        tables.insert_io(year, from_code, to_code, value)
                continue

            is_header = any(
                type(cell) is str and cell.strip("0") == "1" for cell in row)
            if not is_header:
                continue

            ind_codes = []
            for cell in row[3:]:
                if type(cell) is float:
                    cell = str(int(cell))
                # NOTE(review): `table` is not defined inside this function;
                # presumably `codes.has_code` was meant -- confirm.
                if regexes.is_num(cell) or table.has_code(cell):
                    ind_codes.append(cell)
                else:
                    # placeholder keeps positions aligned with row[3:]
                    ind_codes.append(None)

        codes.update_codes()
Ejemplo n.º 2
0
Archivo: parser.py Proyecto: sonya/eea
def parse_io():
    """Load the 2005 and 2007 input-output workbooks into hybrid IO tables.

    For each year the first worksheet is scanned for the header row: the
    first row containing a text cell that equals "1" once surrounding
    zeros are stripped.  Column codes are read from that row starting at
    the fourth column, and every later row is written through
    ``tables.insert_io`` as (year, row code, column code, value).
    """
    sources = {
        2005: fileutils.getdatapath("2005年42部门投入产出流量表.xls", "cn-io"),
        2007: fileutils.getdatapath("0101.xls", "cn-io", "中国投入产出表2007",
                                    "excel"),
    }

    # The spreadsheets merge cells for appearance rather than meaning, so
    # the final-use codes are curated by hand (there are only two years).
    curated_codes = (
        ("FU101", "农村居民消费"),
        ("FU102", "城镇居民消费"),
        ("FU103", "政府消费支出"),
        ("FU201", "固定资本形成总额"),
        ("FU202", "存货增加"),
        ("GCF", "资本形成合计"),
        ("EX", "出口"),
    )
    blacklisted_codes = ("TI", "TII")

    tables = HybridTableCreator(config.SCHEMA)

    for year, path in sources.items():
        tables.add_io_table(year)
        codes = tables.new_sector_codes(year)

        workbook = xlrd.open_workbook(path)
        # in 2005 sheet 0 is x10k RMB, 2007 has only 1 sheet @x10k RMB
        sheet = workbook.sheet_by_index(0)

        for code, description in curated_codes:
            codes.set_code(code, description)
        for code in blacklisted_codes:
            codes.blacklist_code(code)

        ind_codes = None  # stays None until the header row has been seen
        for rownum in range(sheet.nrows):
            row = sheet.row_values(rownum)

            if ind_codes is not None:
                # data row: register the row sector, then store its values
                from_code = codes.set_code(row[2], row[1])
                if not from_code:
                    continue
                for value, to_code in zip(row[3:], ind_codes):
                    if to_code is not None:
                        tables.insert_io(year, from_code, to_code, value)
                continue

            is_header = any(
                type(cell) is str and cell.strip("0") == "1" for cell in row)
            if not is_header:
                continue

            ind_codes = []
            for cell in row[3:]:
                if type(cell) is float:
                    cell = str(int(cell))
                # NOTE(review): `table` is not defined inside this function;
                # presumably `codes.has_code` was meant -- confirm.
                if regexes.is_num(cell) or table.has_code(cell):
                    ind_codes.append(cell)
                else:
                    # placeholder keeps positions aligned with row[3:]
                    ind_codes.append(None)

        codes.update_codes()
Ejemplo n.º 3
0
Archivo: parser.py Proyecto: sonya/eea
def parse_codes():
    """Load commodity codes, industry codes and the sector map from CSV.

    Reads ``commodities.csv`` and ``industries.csv`` from the "ca" data
    directory.  Commodity rows whose first column is numeric are
    registered in the ``<schema>.com_codes`` tracker.  Industry rows with
    at least five columns may register an IO code (columns 0/1), an
    environmental code (columns 2/3) and a harmonized code (columns 4/5)
    in the ``<schema>.ind_codes`` tracker; every row with a numeric
    harmonized code is also written to ``<schema>.sector_map``.
    """
    comcodes = parserutils.add_tracker("%s.com_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("commodities.csv", "ca")
    # newline="" lets the csv module handle line endings itself
    with open(filename, "r", newline="") as fh:
        for row in csv.reader(fh):
            # need both the code and its description column
            if len(row) > 1 and regexes.is_num(row[0]):
                comcodes.set_code(row[0], row[1])
    comcodes.update_codes()

    # create() is called as a statement so that `maptable` is always the
    # SQLTable itself, regardless of what create() returns
    maptable = SQLTable("%s.sector_map" % config.SCHEMA,
                        ["io_code", "env_code", "harmonized"],
                        ["varchar(15)", "varchar(15)", "varchar(15)"])
    maptable.create()

    indcodes = parserutils.add_tracker("%s.ind_codes" % config.SCHEMA, "w")
    filename = fileutils.getdatapath("industries.csv", "ca")
    with open(filename, "r", newline="") as fh:
        for row in csv.reader(fh):
            if len(row) < 5:
                continue

            # blank codes are stored as NULL in the map table
            io_code = row[0] if row[0] else None
            if io_code is not None and row[1]:
                indcodes.set_code(io_code, row[1])

            env_code = row[2] if row[2] else None
            if env_code is not None and row[3]:
                indcodes.set_code(env_code, row[3])

            harmonized = row[4]
            if harmonized and regexes.is_num(harmonized):
                # the description column is optional: a five-column row
                # used to raise IndexError here
                if len(row) > 5:
                    indcodes.set_code(harmonized, row[5])
                maptable.insert([io_code, env_code, harmonized])

    indcodes.update_codes()
Ejemplo n.º 4
0
Archivo: parser.py Proyecto: sonya/eea
def parse_ixi_year(tables, codes, workbook, year):
    """Load one year's intermediate and final demand sheets into the IO table.

    Args:
        tables: object providing ``add_io_table(year)`` and
            ``insert_io(year, from_code, to_code, value)``.
        codes: object providing ``set_code(code, name)``; its return value
            is used as the stored code, and a falsy return skips the row.
        workbook: object providing ``sheet_by_name(name)``; sheets expose
            ``nrows`` and ``row_values(index)``.
        year: integer used in the sheet names and passed to ``tables``.
    """
    tables.add_io_table(year)

    # ---- intermediate demand ------------------------------------------
    sheet = workbook.sheet_by_name("Table 2 - Int Con %d" % year)
    code_row = None
    name_row = None
    ind_codes = []
    for rownum in range(sheet.nrows):
        row = sheet.row_values(rownum)
        if len(row) < 3:
            continue

        if code_row is None:
            # the code row is the first one with a number in column 3
            if type(row[2]) is float or regexes.is_num(row[2]):
                code_row = row
            continue

        if name_row is None:
            # the row right after the codes carries the matching names
            name_row = row
            ind_codes = [codes.set_code(code, name)
                         for (code, name) in zip(code_row, name_row)]
            continue

        from_code = codes.set_code(row[0], row[1])
        if not from_code:
            continue
        for col, value in enumerate(row[2:], start=2):
            tables.insert_io(year, from_code, ind_codes[col], value)

    # ---- final demand -------------------------------------------------
    sheet = workbook.sheet_by_name("Table 2 - Final Demand %d" % year)
    fd_codes = None
    for rownum in range(sheet.nrows):
        row = sheet.row_values(rownum)
        if len(row) < 3:
            continue

        if fd_codes is None:
            # header row is the one whose second cell reads "Product"
            if row[1].strip() == "Product":
                fd_codes = [codes.set_code(None, name) for name in row]
            continue

        from_code = codes.set_code(row[0], row[1])
        if not from_code:
            continue
        for col, value in enumerate(row[2:], start=2):
            tables.insert_io(year, from_code, fd_codes[col], value)
Ejemplo n.º 5
0
Archivo: parser.py Proyecto: sonya/eea
def parse_ixi_year(tables, codes, workbook, year):
    """Load one year's intermediate and final demand sheets into the IO table.

    Args:
        tables: object providing ``add_io_table(year)`` and
            ``insert_io(year, from_code, to_code, value)``.
        codes: object providing ``set_code(code, name)``; its return value
            is used as the stored code, and a falsy return skips the row.
        workbook: object providing ``sheet_by_name(name)``; sheets expose
            ``nrows`` and ``row_values(index)``.
        year: integer used in the sheet names and passed to ``tables``.
    """
    tables.add_io_table(year)

    # ---- intermediate demand ------------------------------------------
    sheet = workbook.sheet_by_name("Table 2 - Int Con %d" % year)
    code_row = None
    name_row = None
    ind_codes = []
    for rownum in range(sheet.nrows):
        row = sheet.row_values(rownum)
        if len(row) < 3:
            continue

        if code_row is None:
            # the code row is the first one with a number in column 3
            if type(row[2]) is float or regexes.is_num(row[2]):
                code_row = row
            continue

        if name_row is None:
            # the row right after the codes carries the matching names
            name_row = row
            ind_codes = [codes.set_code(code, name)
                         for (code, name) in zip(code_row, name_row)]
            continue

        from_code = codes.set_code(row[0], row[1])
        if not from_code:
            continue
        for col, value in enumerate(row[2:], start=2):
            tables.insert_io(year, from_code, ind_codes[col], value)

    # ---- final demand -------------------------------------------------
    sheet = workbook.sheet_by_name("Table 2 - Final Demand %d" % year)
    fd_codes = None
    for rownum in range(sheet.nrows):
        row = sheet.row_values(rownum)
        if len(row) < 3:
            continue

        if fd_codes is None:
            # header row is the one whose second cell reads "Product"
            if row[1].strip() == "Product":
                fd_codes = [codes.set_code(None, name) for name in row]
            continue

        from_code = codes.set_code(row[0], row[1])
        if not from_code:
            continue
        for col, value in enumerate(row[2:], start=2):
            tables.insert_io(year, from_code, fd_codes[col], value)
Ejemplo n.º 6
0
Archivo: parser.py Proyecto: sonya/eea
def parse_env():
    """Load the "tw-env" environmental spreadsheets into per-year tables.

    Two sources are read:

    * The English workbooks (air, water pollution, waste, water use), each
      with one sheet per year named ``"year <YYYY>"``.  A row is used when
      its first cell is numeric or its second cell is a whitelisted sector
      name.  Columns 2-4 (total output, total input, GDP) are repeated in
      every workbook, so they are stored from the first workbook only;
      series values start at column 5.
    * ``sheets.xls``, holding the Chinese-language sheets listed in
      ``sheetnames_by_year``; the first row of each sheet is the header.

    Each year's ``<schema>.env_<year>`` table is created and truncated
    before rows of (sector, series, value) are inserted.
    """
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001

    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
        },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
        },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": [
                "Total waste", "General waste", "Hazardous waste",
                "Total waste - improper disposal",
                "General waste - improper disposal",
                "Hazardous waste - improper disposal",
            ],
        },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
        },
    }

    for year in eng_env_years:
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename, ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        first_file = True
        for tdata in eng_env_files.values():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file:  # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                    for offset, series in enumerate(tdata["columns"]):
                        table.insert([sector, series, row[offset + 5]])
            # Clear the flag only once the whole first workbook is done, so
            # every sector gets its totals; clearing it inside the row loop
            # stored them for the first sector alone.
            first_file = False

    # parse chinese env tables
    # this is a file that we created by compiling older Chinese data and
    # manually copying info from the latest (2010) pdf files

    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
    }

    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename, ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)

            # the 2010 tables have several rows that we don't want:
            # parsing starts on the row after one of the marker cells below
            should_parse = (year != 2010)
            for rowindex in range(1, sheet.nrows):
                row = sheet.row_values(rowindex)
                if should_parse:
                    sector = row[0].strip()
                    for col in range(1, len(header)):
                        measurement = header[col].strip()
                        table.insert([sector, measurement, row[col]])

                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True
Ejemplo n.º 7
0
Archivo: parser.py Proyecto: sonya/eea
def parse_env():
    """Scrape cached HTML emission tables under "cn" into per-year SQL tables.

    Every entry of the "cn" cache directory whose name is numeric is
    treated as a year.  For each year the table ``cn.emissions_<year>`` is
    dropped and re-created, and every cached file in that sub-directory is
    parsed with BeautifulSoup.  The first ``<table>`` with more than 10
    rows is walked row by row; rowspans and overflow rows are folded back
    together heuristically, the first assembled row becomes the column
    header, and each later row of matching width is written via
    ``insert_row``.  All inserts for one year happen between
    ``xact.begin()`` and ``xact.commit()``.
    """
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        # only numerically named cache entries are years
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            """Insert one (industry, pollutant, amount) record per value cell.

            When ``max_sector_column`` is 0 the first cell holds both
            names and is split with ``split_english``; otherwise the
            first two cells are the Chinese and English names.  Cells
            after the sector column(s) are paired with the column
            headers, and empty amounts are skipped.
            """
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]

            for (pollutant, amount) in zip(columns[max_sector_column+1:],
                                           rowdata[max_sector_column+1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        # NOTE(review): commit() is not reached if parsing raises; whether
        # the transaction is then rolled back depends on `db` -- confirm.
        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb") # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)

            print(adir, filename)
            # NOTE(review): `title` is never read afterwards
            title = soup.title.string

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            # NOTE(review): if no table has more than 10 rows, `table`
            # stays None and `table.children` below raises AttributeError
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False # true after we've parsed through
            max_sector_column = 0 # 1 if english separate, 0 otherwise

            prev_rowdata = None
            prev_rowspans = None
            # NOTE(review): `data` is never used
            data = []

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this makes it necessary to keep state using heuristics:
            # a row is held in `insert_later` until the next row shows
            # whether it is a new record or overflow of the held one;
            # `insert_now` is the row released for writing.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                # NOTE(review): `prefix` is assigned but never used
                prefix = None

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)

                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    # drop surrounding whitespace/dots, ellipses and nbsp
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                    abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                # NOTE(review): `rowspan` here is whatever the cell loop
                # above left behind (the last cell of this row, or of an
                # earlier row when this row had no cells)
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan: # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0: # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so 
                # the multirow rules don't get applied anymore
                # (this rebinds the local name only; `prev_rowspans` still
                # refers to the list built above)
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    # new record: release the held row and hold this one
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "

                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    # the first released row is the column header
                    columns = insert_now
                    insert_now = None

                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop: flush the row still being held
            # NOTE(review): if `columns` is still None here (fewer than two
            # usable rows), len(columns) raises TypeError
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()
Ejemplo n.º 8
0
Archivo: parser.py Proyecto: sonya/eea
def parse_env():
    """Scrape cached HTML emission tables under "cn" into per-year SQL tables.

    Every entry of the "cn" cache directory whose name is numeric is
    treated as a year.  For each year the table ``cn.emissions_<year>`` is
    dropped and re-created, and every cached file in that sub-directory is
    parsed with BeautifulSoup.  The first ``<table>`` with more than 10
    rows is walked row by row; rowspans and overflow rows are folded back
    together heuristically, the first assembled row becomes the column
    header, and each later row of matching width is written via
    ``insert_row``.  All inserts for one year happen between
    ``xact.begin()`` and ``xact.commit()``.
    """
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        # only numerically named cache entries are years
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            """Insert one (industry, pollutant, amount) record per value cell.

            When ``max_sector_column`` is 0 the first cell holds both
            names and is split with ``split_english``; otherwise the
            first two cells are the Chinese and English names.  Cells
            after the sector column(s) are paired with the column
            headers, and empty amounts are skipped.
            """
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]

            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        # NOTE(review): commit() is not reached if parsing raises; whether
        # the transaction is then rolled back depends on `db` -- confirm.
        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)

            print(adir, filename)
            # NOTE(review): `title` is never read afterwards
            title = soup.title.string

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            # NOTE(review): if no table has more than 10 rows, `table`
            # stays None and `table.children` below raises AttributeError
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise

            prev_rowdata = None
            prev_rowspans = None
            # NOTE(review): `data` is never used
            data = []

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this makes it necessary to keep state using heuristics:
            # a row is held in `insert_later` until the next row shows
            # whether it is a new record or overflow of the held one;
            # `insert_now` is the row released for writing.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                # NOTE(review): `prefix` is assigned but never used
                prefix = None

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)

                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    # drop surrounding whitespace/dots, ellipses and nbsp
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                # NOTE(review): `rowspan` here is whatever the cell loop
                # above left behind (the last cell of this row, or of an
                # earlier row when this row had no cells)
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                # (this rebinds the local name only; `prev_rowspans` still
                # refers to the list built above)
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    # new record: release the held row and hold this one
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "

                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    # the first released row is the column header
                    columns = insert_now
                    insert_now = None

                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and len(insert_now) == len(
                        columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop: flush the row still being held
            # NOTE(review): if `columns` is still None here (fewer than two
            # usable rows), len(columns) raises TypeError
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()
Ejemplo n.º 9
0
Archivo: parser.py Proyecto: sonya/eea
def parse_env():
    """Load the "tw-env" environmental spreadsheets into per-year tables.

    Two sources are read:

    * The English workbooks (air, water pollution, waste, water use), each
      with one sheet per year named ``"year <YYYY>"``.  A row is used when
      its first cell is numeric or its second cell is a whitelisted sector
      name.  Columns 2-4 (total output, total input, GDP) are repeated in
      every workbook, so they are stored from the first workbook only;
      series values start at column 5.
    * ``sheets.xls``, holding the Chinese-language sheets listed in
      ``sheetnames_by_year``; the first row of each sheet is the header.

    Each year's ``<schema>.env_<year>`` table is created and truncated
    before rows of (sector, series, value) are inserted.
    """
    # parse english env files
    # TODO: might want to use the energy table as well.
    # it is very comprehensive, but formatted differently and only has 2001

    sector_whitelist = ("Household Consumption", "Fixed Capital Formation")
    eng_env_years = [1999, 2001, 2004]
    eng_env_files = {
        "air_pol": {
            "filename": "IO_air.xls",
            "columns": ["TSP", "PM10", "SOx", "NOx", "NMHC", "CO", "Pb"],
            },
        "water_pol": {
            "filename": "IO_pol_water.xls",
            "columns": ["BOD", "COD", "SS"],
            },
        "waste_pol": {
            "filename": "IO_waste.xls",
            "columns": ["Total waste", "General waste",
                        "Hazardous waste", "Total waste - improper disposal",
                        "General waste - improper disposal",
                        "Hazardous waste - improper disposal"],
            },
        "water_use": {
            "filename": "IO_res_water.xls",
            "columns": ["Natural water", "Abstracted water"],
            },
        }

    for year in eng_env_years:
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        first_file = True
        for tdata in eng_env_files.values():
            path = fileutils.getdatapath(tdata["filename"], "tw-env")
            wb = xlrd.open_workbook(path)
            sheet = wb.sheet_by_name("year %d" % year)
            for rowindex in range(sheet.nrows):
                row = sheet.row_values(rowindex)
                if len(row) > 1 and \
                        (regexes.is_num(row[0]) or row[1] in sector_whitelist):
                    sector = row[1].strip()
                    if first_file: # these columns are repeated in every file
                        table.insert([sector, "Total Output", row[2]])
                        table.insert([sector, "Total Input", row[3]])
                        table.insert([sector, "GDP", row[4]])
                    for offset, series in enumerate(tdata["columns"]):
                        table.insert([sector, series, row[offset + 5]])
            # Clear the flag only once the whole first workbook is done, so
            # every sector gets its totals; clearing it inside the row loop
            # stored them for the first sector alone.
            first_file = False

    # parse chinese env tables
    # this is a file that we created by compiling older Chinese data and
    # manually copying info from the latest (2010) pdf files

    # skip 2001 because the english version is better
    sheetnames_by_year = {
        2000: ["89年空汙", "89年廢棄物"],
        2002: ["91年空汙", "91年廢棄物"],
        2003: ["92年空汙", "92年廢棄物"],
        2010: ["99年空汙", "99年水汙", "99年廢棄物"],
        }

    path = fileutils.getdatapath("sheets.xls", "tw-env")
    wb = xlrd.open_workbook(path)
    for (year, sheetnames) in sheetnames_by_year.items():
        tablename = "%s.env_%d" % (config.SCHEMA, year)
        table = SQLTable(tablename,
                         ["sector", "series", "value"],
                         ["varchar(55)", "varchar(255)", "float"])
        table.create()
        table.truncate()

        for sheetname in sheetnames:
            sheet = wb.sheet_by_name(sheetname)
            header = sheet.row_values(0)

            # the 2010 tables have several rows that we don't want:
            # parsing starts on the row after one of the marker cells below
            should_parse = (year != 2010)
            for rowindex in range(1, sheet.nrows):
                row = sheet.row_values(rowindex)
                if should_parse:
                    sector = row[0].strip()
                    for col in range(1, len(header)):
                        measurement = header[col].strip()
                        table.insert([sector, measurement, row[col]])

                elif row[0] in ("依行業分", "依部門分"):
                    should_parse = True