Python getcachecontents Examples

Programming Language: Python

Namespace/Package Name: common.fileutils

Method/Function: getcachecontents

Examples at hotexamples.com: 4

Python getcachecontents - 4 examples found. These are the top-rated real-world Python examples of common.fileutils.getcachecontents, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def parse_tables():
    """Load the recognised files of the "io-annual" cache into SQL tables.

    Every entry returned by ``fileutils.getcachecontents("io-annual")`` is
    resolved to a path, printed, and wrapped in a ``CSVTable``.  The entry
    is then classified:

    * ``is_make(entry)`` truthy  -> ``<schema>.annual_make_<value>``
    * ``is_use(entry)`` truthy   -> ``<schema>.annual_use_<value>``
    * entry named ``codes.csv``  -> ``<schema>.annual_codes``

    where ``<schema>`` is ``config.IO_SCHEMA`` and ``<value>`` is whatever
    the classifier returned (presumably the year -- confirm against
    ``is_make`` / ``is_use``).  Entries matching none of these are skipped
    without being parsed.
    """
    cache_name = "io-annual"
    code_type = "varchar(6)"

    for entry in fileutils.getcachecontents(cache_name):
        cached_path = fileutils.getcache(entry, cache_name)
        print(cached_path)
        csv_table = CSVTable(cached_path, False)

        # both classifiers are evaluated up front, before any branching
        year_if_make = is_make(entry)
        year_if_use = is_use(entry)

        if year_if_make:
            table_name = "%s.annual_make_%s" % (config.IO_SCHEMA, year_if_make)
            column_names = ["industry", "commodity", "value"]
            column_types = [code_type, code_type, "float"]
        elif year_if_use:
            table_name = "%s.annual_use_%s" % (config.IO_SCHEMA, year_if_use)
            # note the swapped order relative to the make table
            column_names = ["commodity", "industry", "value"]
            column_types = [code_type, code_type, "float"]
        elif entry == "codes.csv":
            table_name = "%s.annual_codes" % config.IO_SCHEMA
            column_names = ["code", "description"]
            column_types = [code_type, "text"]
        else:
            # not a file we know how to load
            continue

        csv_table.create_sql_table(table_name, column_names, column_types)
        csv_table.parse_to_sql()

Example #2

Show file

def create_views():
    """Rebuild the commodity-by-commodity table for each make-table year.

    The years are the truthy results of ``is_make`` over the entries of the
    "io-annual" cache.  For each year the table
    ``<schema>.annual_cxc_<year>`` is dropped (if present) and recreated
    with ``SELECT ... INTO`` from the matching make and use tables:

    * each use row is spread over the commodities its industry makes, in
      proportion to that industry's output share
      (``make.value / SUM(make.value) per industry``);
    * use rows whose industry has no make row with commodity ``'TIO'`` are
      carried over unchanged, keyed by the industry code;
    * the combined rows are summed per ``(from_sector, to_sector)``.
    """
    cached_files = fileutils.getcachecontents("io-annual")
    years = [year for year in map(is_make, cached_files) if year]

    for year in years:
        strings = {
            "make_table": "%s.annual_make_%s" % (config.IO_SCHEMA, year),
            "use_table": "%s.annual_use_%s" % (config.IO_SCHEMA, year),
            "cxc_table": "%s.annual_cxc_%s" % (config.IO_SCHEMA, year),
            }

        # IF EXISTS: a plain DROP TABLE fails the first time a year is
        # processed, because its cxc table has not been created yet
        db.execute("DROP TABLE IF EXISTS %(cxc_table)s" % strings)

        # NOTE(review): UNION (not UNION ALL) removes identical
        # (from_sector, to_sector, value) rows before they are summed --
        # confirm that de-duplication is intended.
        db.execute("""SELECT from_sector, to_sector, SUM(value) AS value
          INTO %(cxc_table)s
          FROM (SELECT use.commodity AS from_sector,
                       indshare.commodity AS to_sector,
                       use.value * indshare.output_share AS value
                  FROM (SELECT make.industry,
                               make.commodity,
                               make.value / indtotal.value AS output_share
                          FROM %(make_table)s make,
                               (SELECT industry, SUM(value) AS value
                                  FROM %(make_table)s
                                 GROUP BY industry) indtotal
                         WHERE make.industry = indtotal.industry) indshare,
                       %(use_table)s use
                 WHERE indshare.industry = use.industry
              UNION
                SELECT use.commodity AS from_sector,
                       use.industry AS to_sector,
                       use.value AS value
                  FROM %(use_table)s use
                 WHERE industry NOT IN (SELECT industry
                                          FROM %(make_table)s make
                                         WHERE commodity = 'TIO')
               ) allocations
         GROUP BY from_sector, to_sector""" % strings)

Example #3

Show file

File: parser.py Project: sonya/eea

def parse_env():
    """Parse cached emissions HTML pages into per-year SQL tables.

    For every all-numeric entry of the "cn" cache (used as the year), the
    table ``cn.emissions_<year>`` is dropped and recreated, and each file
    cached under ``cn/<year>`` is parsed with BeautifulSoup.  The first
    ``<table>`` (or its ``<tbody>``) with more than 10 rows is taken to be
    the data table; files without such a table are reported and skipped.

    Rows are buffered one step (``insert_later`` -> ``insert_now``) so that
    text-only "overflow" rows can be merged into the row above them.
    Everything buffered before the first row containing a number becomes
    the column headers; each later buffered row whose length matches the
    headers yields one ``[industry_zh, industry_en, pollutant, amount]``
    insert per non-empty value cell.
    """
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        # only all-numeric cache entries are treated as year directories
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            """Insert one record per non-empty value cell of ``rowdata``.

            With ``max_sector_column == 0`` the first cell carries both
            industry names and is split by ``split_english``; otherwise
            cells 0 and 1 are used as the two names directly.
            """
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]

            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if amount:
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        # NOTE(review): an exception while parsing leaves this transaction
        # neither committed nor rolled back here -- confirm how the db
        # wrapper handles that before adding explicit rollback.
        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            # binary b/c of non-utf encoding
            with open(filepath, "rb") as fh:
                html = fh.read()
            soup = BeautifulSoup(html)

            print(adir, filename)

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            if table is None:
                # nothing to parse; without this guard the row loop below
                # would fail on table.children and abort the whole run
                print("no data table found in", adir, filename)
                continue

            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise

            prev_rowdata = None
            prev_rowspans = None

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)

                for cellpos, cell in enumerate(cells):
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                # (rowspan here is the span of the last cell processed)
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                # NOTE(review): this rebinds only the local name; the list
                # already stored in prev_rowspans above is unchanged and
                # rowspans is re-created at the top of the next iteration,
                # so this currently has no effect -- confirm intent.
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = any(regexes.is_num(field) for field in rowdata)
                if has_numbers:
                    did_have_numbers = True

                if has_numbers or insert_later is None:
                    # a new logical row starts: release the buffered one
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "

                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                if insert_now is not None and columns is None:
                    # the first released row is the header row
                    columns = insert_now
                    insert_now = None

                    for i, name in enumerate(columns):
                        columns[i] = name.replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and \
                        len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop: flush the last buffered row.  columns is still
            # None when no header row was ever released, so check it before
            # taking its length
            if insert_later is not None and columns is not None \
                    and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()

Example #4

Show file

File: parser.py Project: sonya/eea

def parse_env():
    """Parse cached emissions HTML pages into per-year SQL tables.

    For every all-numeric entry of the "cn" cache (used as the year), the
    table ``cn.emissions_<year>`` is dropped and recreated, and each file
    cached under ``cn/<year>`` is parsed with BeautifulSoup.  The first
    ``<table>`` (or its ``<tbody>``) with more than 10 rows is taken to be
    the data table; files without such a table are reported and skipped.

    Rows are buffered one step (``insert_later`` -> ``insert_now``) so that
    text-only "overflow" rows can be merged into the row above them.
    Everything buffered before the first row containing a number becomes
    the column headers; each later buffered row whose length matches the
    headers yields one ``[industry_zh, industry_en, pollutant, amount]``
    insert per non-empty value cell.
    """
    cache_dirs = fileutils.getcachecontents("cn")

    for adir in cache_dirs:
        # only all-numeric cache entries are treated as year directories
        if regexes.is_num(adir):
            year = int(adir)
        else:
            continue

        db_table = SQLTable(
            "cn.emissions_%d" % year,
            ["industry_zh", "industry_en", "pollutant", "amount"],
            ["varchar(1023)", "varchar(1023)", "varchar(1023)", "float"])
        db_table.drop()
        db_table.create()

        def insert_row(rowdata, columns, max_sector_column):
            """Insert one record per non-empty value cell of ``rowdata``.

            With ``max_sector_column == 0`` the first cell carries both
            industry names and is split by ``split_english``; otherwise
            cells 0 and 1 are used as the two names directly.
            """
            if max_sector_column == 0:
                (ind_zh, ind_en) = split_english(rowdata[0])
            else:
                ind_zh = rowdata[0]
                ind_en = rowdata[1]

            for (pollutant, amount) in zip(columns[max_sector_column + 1:],
                                           rowdata[max_sector_column + 1:]):
                if amount:
                    db_table.insert([ind_zh, ind_en, pollutant, amount])

        # NOTE(review): an exception while parsing leaves this transaction
        # neither committed nor rolled back here -- confirm how the db
        # wrapper handles that before adding explicit rollback.
        xact = db.xact(mode="READ WRITE")
        xact.begin()

        subdir = os.path.join("cn", adir)
        files = fileutils.getcachecontents(subdir)
        for filename in files:
            filepath = fileutils.getcache(filename, subdir)
            # binary b/c of non-utf encoding
            with open(filepath, "rb") as fh:
                html = fh.read()
            soup = BeautifulSoup(html)

            print(adir, filename)

            # mad maaad nested tables!
            # we'll just have to find one with a large number of rows
            # and hope that's the right one
            table = None
            for test_table in soup.find_all("table"):
                if test_table.tbody:
                    test_table = test_table.tbody
                num_rows = len(list(filter(is_row, test_table.children)))
                if num_rows > 10:
                    table = test_table
                    break

            if table is None:
                # nothing to parse; without this guard the row loop below
                # would fail on table.children and abort the whole run
                print("no data table found in", adir, filename)
                continue

            columns = None
            did_have_numbers = False  # true after we've parsed through
            max_sector_column = 0  # 1 if english separate, 0 otherwise

            prev_rowdata = None
            prev_rowspans = None

            # long cell values are often expanded into the cell directly
            # below (multiple rows) resulting in rows that are blank
            # except in cells that contain overflow.
            # this necessitates to keep state using heuristics.
            insert_later = None
            insert_now = None

            for row in table.children:
                if not is_tag(row) or row.name != "tr":
                    continue

                rowspans = []
                rowdata = []

                cells = list(filter(is_cell, row.children))
                rowlen = len(cells)

                for cellpos, cell in enumerate(cells):
                    rowspan = 1
                    if "rowspan" in cell.attrs:
                        rowspan = int(cell["rowspan"])

                    cellvalue = cell.text.strip().strip(".")\
                        .replace('…', '').replace('\xa0', '')

                    # use previous rowspan if we have one of the buggy blank
                    # cells at the end, which don't have the proper rowspan
                    if cellpos == rowlen - 1 and \
                            len(cellvalue) == 0 and len(rowspans) > 0:
                        rowspan = rowspans[-1]

                    # if the cell directly before us in the previous row
                    # spanned multiple rows, create a blank space in this row.
                    # the abs difference below is used for counting down:
                    # if rowspan in previous column was 6 and current is 1
                    # the difference is -5, on the next row that will
                    # be subtracted again
                    if prev_rowspans is not None:
                        i = len(rowdata)
                        while i < len(prev_rowspans) and \
                                abs(prev_rowspans[i]) > rowspan:
                            rowdata.append('')
                            rowspans.append(
                                -abs(abs(rowspan) - abs(prev_rowspans[i])))
                            i = len(rowdata)

                    rowdata.append(cellvalue)
                    rowspans.append(rowspan)

                # count any multi-row cells that were at the end
                # (rowspan here is the span of the last cell processed)
                if prev_rowdata is not None:
                    for i in range(len(rowdata), len(prev_rowdata)):
                        if prev_rowspans[i] > rowspan:
                            rowdata.append(prev_rowdata[i])
                            rowspans.append(rowspan)

                # remove blank cells at the end - these appear to be bugs
                while len(rowdata) and len(rowdata[-1]) == 0 and \
                        (columns is None or len(rowdata) != len(columns)):
                    rowdata.pop()
                    rowspans.pop()

                # end of rowdata manipulation
                prev_rowdata = rowdata
                prev_rowspans = rowspans

                if len(rowdata) == 0:
                    continue

                # ignore rows that they put above the column headers
                # we'll just special case anything we find
                if columns is None and rowdata[0].startswith("单位"):
                    prev_rowdata = None
                    prev_rowspans = None
                    continue

                lengths = [len(x) for x in rowdata]
                if sum(lengths) == 0:  # all blank strings
                    continue

                # if we're sure we have columns, clean up rowdata so
                # the multirow rules don't get applied anymore
                # NOTE(review): this rebinds only the local name; the list
                # already stored in prev_rowspans above is unchanged and
                # rowspans is re-created at the top of the next iteration,
                # so this currently has no effect -- confirm intent.
                if sum(rowspans) == rowspan * len(rowspans):
                    rowspans = [1] * len(rowspans)

                has_numbers = any(regexes.is_num(field) for field in rowdata)
                if has_numbers:
                    did_have_numbers = True

                if has_numbers or insert_later is None:
                    # a new logical row starts: release the buffered one
                    insert_now = insert_later
                    insert_later = rowdata
                else:
                    # decide whether this row is an overflow
                    # already know sum(lengths) > 0
                    if len(rowdata) >= len(insert_later) and \
                            (lengths[0] == 0 or lengths[-1] == 0):
                        # we shouldn't see overflow on both sides
                        # because rowdata[0] should happen in a header row
                        # and rowdata[-1] must happen in a data row
                        for i in range(len(insert_later)):
                            # don't want to append to "hang ye" or "Sector"
                            if not did_have_numbers \
                                    and i > max_sector_column + 1 \
                                    and len(insert_later[i]) == 0:
                                # blank above, assume "multirow" to the left
                                insert_later[i] = insert_later[i - 1] + " - "

                            if lengths[i]:
                                insert_later[i] += " " + rowdata[i]

                    # if we knocked blank cells off the previous row but
                    # we know it's actually longer from the current row
                    for i in range(len(insert_later), len(rowdata)):
                        insert_later.append(rowdata[i])

                if insert_now is not None and columns is None:
                    # the first released row is the header row
                    columns = insert_now
                    insert_now = None

                    for i, name in enumerate(columns):
                        columns[i] = name.replace("\n", " ")

                    # figure out if english names are separate or not
                    if len(columns) > 1 and columns[1].strip() == "Sector":
                        max_sector_column = 1

                elif insert_now is not None and \
                        len(insert_now) == len(columns):
                    insert_row(insert_now, columns, max_sector_column)
                    insert_now = None
                else:
                    # we don't want to get here - debug
                    if insert_now is not None:
                        print(len(insert_now), len(columns), insert_now)

            # close the loop: flush the last buffered row.  columns is still
            # None when no header row was ever released, so check it before
            # taking its length
            if insert_later is not None and columns is not None \
                    and len(insert_later) == len(columns):
                insert_row(insert_later, columns, max_sector_column)

            print(columns)

        xact.commit()