def scrape_page(mt_file, url):
    """Scrape ``url`` for links and record them in the Metatab file.

    The page itself is recorded as a ``DownloadPage`` term. Every entry
    under the scraper's ``sources`` key becomes a ``DataFile`` term in the
    resources section, and every entry under ``external_documentation``
    becomes a term in the documentation section whose name is chosen by
    ``classify_url``. The document is then written back to ``mt_file``.

    :param mt_file: Metatab file to load and to overwrite with the result.
    :param url: Web page to scrape for data and documentation links.
    """
    from .util import scrape_urls_from_web_page

    doc = MetatabDoc(mt_file)

    doc['resources'].new_term('DownloadPage', url)

    scraped = scrape_urls_from_web_page(url)

    # Only the entry values are used; the mapping keys are ignored.
    for source in scraped['sources'].values():
        doc['Resources'].new_term(
            'DataFile',
            source['url'],
            description=source.get('description'),
        )

    for link in scraped['external_documentation'].values():
        term_name = classify_url(link['url'])
        doc['Documentation'].new_term(
            term_name,
            link['url'],
            description=link.get('description'),
        )

    doc.write_csv(mt_file)
def rewrite_resource_format(mt_file):
    """Move table schemas from the ``schema`` section onto their resources.

    If the document has a ``schema`` section, each of its terms is
    converted to a dict keyed by the term's value, the section is deleted,
    and every term in ``resources`` receives a ``Schema`` child holding one
    ``column`` child per column. Every column property other than ``name``
    is attached as a child of that column. The document is written back to
    ``mt_file`` whether or not a ``schema`` section was present.

    :param mt_file: Metatab file to load and to overwrite with the result.
    """
    doc = MetatabDoc(mt_file)

    if 'schema' in doc:
        table_schemas = {t.value: t.as_dict() for t in doc['schema']}
        del doc['schema']

        for resource in doc['resources']:
            schema_term = resource.new_child('Schema', '')

            # A resource may have no matching table schema, or a schema
            # with no columns. Fall back to an empty list instead of
            # indexing the empty default dict, which raised KeyError.
            table = table_schemas.get(resource.find_value('name'), {})
            columns = table.get('column', [])

            for column in columns:
                column_term = schema_term.new_child('column', column['name'])

                # The name is already the column term's value; the other
                # properties become its children.
                for key, value in column.items():
                    if key != 'name':
                        column_term.new_child(key, value)

    doc.write_csv(mt_file)
sequences = { row.table_id: (row.sequence_number, row.start, row.table_cells) for row in sequence_p if row.start } root_doc = MetatabDoc() root = root_doc.new_section('Root') root.new_term('Declare', 'http://assets.metatab.org/census.csv') root.new_term('Title', 'American Community Survey, 5 Year, 2009-2014') root.new_term('Release', 5) root.new_term('Year', 2014) root.new_term('Include', 'acs20145-sources.csv') root.new_term('Include', 'acs20145-schema.csv') root_doc.write_csv('acs20145-metadata.csv') src_doc = MetatabDoc() source_sec = src_doc.new_section('Sources', ['geography', 'state']) from censuslib import ACS09TableRowGenerator as TableRowGenerator b = l.bundle('census.gov-acs-p5ye2014') b = b.cast_to_subclass() s = b.source('b00001') # Any source will do trg = TableRowGenerator(b, s) for s1, s2 in trg.generate_source_refs(): # S1 and S2 are for the estimates and margins file, # so we nly need one of them