Esempio n. 1
0
def scrape_page(mt_file, url):
    """Add the links scraped from a web page to a Metatab file.

    Loads ``mt_file`` as a ``MetatabDoc``, records ``url`` as a
    ``DownloadPage`` term, adds one ``DataFile`` term per scraped source
    and one classified term per external documentation link, then writes
    the document back to ``mt_file``.

    :param mt_file: Metatab file to load and then overwrite.
    :param url: Address of the web page to scrape.
    """
    from .util import scrape_urls_from_web_page

    doc = MetatabDoc(mt_file)
    doc['resources'].new_term('DownloadPage', url)

    scraped = scrape_urls_from_web_page(url)

    # Only the entries are needed; the keys of both mappings are ignored.
    for source in scraped['sources'].values():
        doc['Resources'].new_term(
            'DataFile',
            source['url'],
            description=source.get('description'),
        )

    for link in scraped['external_documentation'].values():
        # The term name is chosen by classify_url from the link's URL.
        link_term = classify_url(link['url'])
        doc['Documentation'].new_term(
            link_term,
            link['url'],
            description=link.get('description'),
        )

    doc.write_csv(mt_file)
Esempio n. 2
0
def rewrite_resource_format(mt_file):
    """Move table schemas from the 'schema' section onto each resource.

    Loads ``mt_file`` as a ``MetatabDoc``. If the document has a 'schema'
    section, each of its terms is captured as a dict keyed by the term's
    value, the section is deleted, and every resource receives a
    ``Schema`` child holding one ``column`` child per column of the table
    whose key equals the resource's ``name`` value. All column properties
    other than ``name`` become children of the column. The document is
    always written back to ``mt_file``.

    A resource with no matching table, or whose table has no columns,
    gets an empty ``Schema`` child instead of raising ``KeyError``.

    :param mt_file: Metatab file to load and then overwrite.
    """
    doc = MetatabDoc(mt_file)

    if 'schema' in doc:
        # Snapshot the tables before the section is removed from the doc.
        table_schemas = {t.value: t.as_dict() for t in doc['schema']}
        del doc['schema']

        for resource in doc['resources']:
            s = resource.new_child('Schema', '')

            # Tolerate resources without a table (or a table without a
            # 'column' entry) rather than failing on the missing key.
            table = table_schemas.get(resource.find_value('name'), {})
            for column in table.get('column', []):
                c = s.new_child('column', column['name'])

                for k, v in column.items():
                    # 'name' is already the column term's value.
                    if k != 'name':
                        c.new_child(k, v)

    doc.write_csv(mt_file)
Esempio n. 3
0
# Index the sequence rows by table id, keeping only rows with a truthy
# ``start``. Each value is (sequence_number, start, table_cells).
# ``sequence_p`` is defined outside this excerpt.
sequences = {
    row.table_id: (row.sequence_number, row.start, row.table_cells)
    for row in sequence_p if row.start
}

# Root metadata document: a single 'Root' section with the declaration,
# descriptive terms, and Include terms naming the sources and schema files.
root_doc = MetatabDoc()
root = root_doc.new_section('Root')

root.new_term('Declare', 'http://assets.metatab.org/census.csv')
root.new_term('Title', 'American Community Survey, 5 Year, 2009-2014')
root.new_term('Release', 5)
root.new_term('Year', 2014)
root.new_term('Include', 'acs20145-sources.csv')
root.new_term('Include', 'acs20145-schema.csv')

root_doc.write_csv('acs20145-metadata.csv')

# Separate document for the sources.
# NOTE(review): the second argument to new_section is presumably the
# section's extra argument/column names -- confirm against MetatabDoc.
src_doc = MetatabDoc()
source_sec = src_doc.new_section('Sources', ['geography', 'state'])

from censuslib import ACS09TableRowGenerator as TableRowGenerator

# ``l`` is defined outside this excerpt; it supplies the bundle by name.
b = l.bundle('census.gov-acs-p5ye2014')
b = b.cast_to_subclass()
s = b.source('b00001')  # Any source will do

# Row generator built from the bundle and the single source picked above.
trg = TableRowGenerator(b, s)

for s1, s2 in trg.generate_source_refs():
    # S1 and S2 are for the estimates and margins file,
    # so we only need one of them