Example #1
0
def parse_tables():
    files = fileutils.getcachecontents("io-annual")
    for filename in files:
        path = fileutils.getcache(filename, "io-annual")
        print(path)
        table = CSVTable(path, False)

        make_year = is_make(filename)
        use_year = is_use(filename)
        if make_year:
            table.create_sql_table(
                "%s.annual_make_%s" % (config.IO_SCHEMA, make_year),
                ["industry", "commodity", "value"],
                ["varchar(6)", "varchar(6)", "float"])

        elif use_year:
            table.create_sql_table(
                "%s.annual_use_%s" % (config.IO_SCHEMA, use_year),
                ["commodity", "industry", "value"],
                ["varchar(6)", "varchar(6)", "float"])

        elif filename == "codes.csv":
            table.create_sql_table(
                "%s.annual_codes" % config.IO_SCHEMA,
                ["code", "description"],
                ["varchar(6)", "text"])

        else:
            continue

        table.parse_to_sql()
Example #2
0
def create_views():
    years = []
    files = fileutils.getcachecontents("io-annual")
    for filename in files:
        year = is_make(filename)
        if year:
            years.append(year)

    for year in years:
        strings = {
            "make_table": "%s.annual_make_%s" % (config.IO_SCHEMA, year),
            "use_table": "%s.annual_use_%s" % (config.IO_SCHEMA, year),
            "cxc_table": "%s.annual_cxc_%s" % (config.IO_SCHEMA, year),
            }

        db.execute("DROP TABLE %(cxc_table)s" % strings)

        db.execute("""SELECT from_sector, to_sector, SUM(value) AS value
          INTO %(cxc_table)s
          FROM (SELECT use.commodity AS from_sector,
                       indshare.commodity AS to_sector,
                       use.value * indshare.output_share AS value
                  FROM (SELECT make.industry,
                               make.commodity,
                               make.value / indtotal.value AS output_share
                          FROM %(make_table)s make,
                               (SELECT industry, SUM(value) AS value
                                  FROM %(make_table)s
                                 GROUP BY industry) indtotal
                         WHERE make.industry = indtotal.industry) indshare,
                       %(use_table)s use
                 WHERE indshare.industry = use.industry
              UNION
                SELECT use.commodity AS from_sector,
                       use.industry AS to_sector,
                       use.value AS value
                  FROM %(use_table)s use
                 WHERE industry NOT IN (SELECT industry
                                          FROM %(make_table)s make
                                         WHERE commodity = 'TIO')
               ) allocations
         GROUP BY from_sector, to_sector""" % strings)
Example #3
0
File: parser.py Project: sonya/eea
def parse_env():
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue
    
        db_table = SQLTable("cn.emissions_%d" % year,
                            ["industry_zh", "industry_en",
                             "pollutant", "amount"],
                            ["varchar(1023)", "varchar(1023)",
                             "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()
    
        def insert_row(rowdata, columns, max_sector_column):
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]
    
            for (pollutant, amount) in zip(columns[max_sector_column+1:],
                                           rowdata[max_sector_column+1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])
    
        xact = db.xact(mode="READ WRITE")
        xact.begin()
    
        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb") # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)
    
            print(adir, filename)
            title = soup.title.string
    
            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break
    
            columns = None
            did_have_numbers = False # true after we've parsed through
            max_sector_column = 0 # 1 if english separate, 0 otherwise
    
            prev_rowdata = None
            prev_rowspans = None
            data = []
    
            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None
            insert_now = None
    
            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue
    
                rowspans = []
                rowdata = []
    
                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None
    
                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)
    
                for cellpos in range(rowlen):
                    cell = cells[cellpos]
    
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])
    
                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')
    
                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]
    
                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(-abs(
                                    abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)
    
                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)
    
                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan: # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)
    
                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()
    
                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans
    
                if len(rowdata) == 0:
                    continue
    
                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue
    
                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0: # all blank strings
                    continue
    
                # if we're sure we have columns, clean up rowdata so 
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1]*len(rowspans)
    
                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break
    
                if has_numbers or insert_later is None:
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i-1] + " - "
    
                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]
    
                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])
    
                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    columns = insert_now
                    insert_now = None
    
                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")
    
                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1
    
                elif insert_now is not None and len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)
    
            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)
    
            print(columns)
    
        xact.commit()
Example #4
0
File: parser.py Project: sonya/eea
def parse_env():
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]

            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if (len(amount)):
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            fh = open(filepath, "rb")  # binary b/c of non-utf encoding
            html = fh.read()
            fh.close()
            soup = BeautifulSoup(html)

            print(adir, filename)
            title = soup.title.string

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise

            prev_rowdata = None
            prev_rowspans = None
            data = []

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                # multi-row cells precede sub-parts of the pollutant
                # which can't be distinguished without their parent
                prefix = None

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)

                for cellpos in range(rowlen):
                    cell = cells[cellpos]

                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:  # span of last cell
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = False
                for field in rowdata:
                    if regexes.is_num(field):
                        has_numbers = True
                        did_have_numbers = True
                        break

                if has_numbers or insert_later is None:
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "

                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                #if not has_numbers and not did_have_numbers: # near BOF
                if insert_now is not None and columns is None:
                    columns = insert_now
                    insert_now = None

                    for i in range(len(columns)):
                        columns[i] = columns[i].replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and len(insert_now) == len(
                        columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop
            if insert_later is not None and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()