def publish_indicators(start_from=0):
    """Publish HSCIC indicators from the scraped indicators.json file.

    Loads indicators.json from DATA_DIR and creates/updates one CKAN
    dataset per indicator, starting at index ``start_from``. A failure
    on one indicator is logged and does not stop the run.
    """
    indicatorfile = DATA_DIR / 'indicators.json'
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))
    for record in indicators[start_from:]:
        logging.info('Processing {}'.format(record['title']))
        logging.info('ID: {}'.format(record['unique identifier'].lower()))
        try:
            # One resource per source file; the file itself is uploaded.
            resource_list = []
            for src in record['sources']:
                resource_list.append(dict(
                    description=src['description'],
                    name=src['url'].split('/')[-1],
                    format=src['filetype'],
                    upload=dc.fh_for_url(src['url'])))
            dc.Dataset.create_or_update(
                name=record['unique identifier'].lower(),
                title=record['title'],
                state='active',
                licence_id='ogl',
                notes=record['definition'],
                url='https://indicators.ic.nhs.uk/webview/',
                tags=dc.tags(*record['keyword(s)']),
                resources=resource_list,
                owner_org='hscic')
        except Exception as ex:
            logging.error(ex)
    return
def load_statistic(dataset, directory): if '2015' not in dataset['title']: print 'Skipping', dataset['title'].encode('utf8'), dataset['name'].encode('utf8') return print 'Creating', dataset['title'].encode('utf8'), dataset['name'].encode('utf8') try: extras = [] if dataset.get('coverage_start_date', ''): extras.append(dict(key='coverage_start_date', value=dataset['coverage_start_date'])) if dataset.get('coverage_end_date', ''): extras.append(dict(key='coverage_end_date', value=dataset['coverage_end_date'])) if dataset.get('frequency', ''): extras.append(dict(key='frequency', value=dataset['frequency'])) dc.Dataset.create_or_update( name=dataset['name'], title=dataset['title'], state='active', license_id='uk-ogl', notes=dataset['notes'], origin=dataset['origin'], tags=dc.tags(*dataset['tags']), resources=dataset["resources"], owner_org='nhs-england', extras=extras, ) return True except Exception, e: print "ERROR: Problem updating/creating dataset - {}".format(dataset['name']) print e
def publish_ods():
    """ Do Useful Work Here """
    # Load the scraped ODS metadata from disk.
    metadatafile = DATA_DIR/'ods.json'
    metadata = metadatafile.json_load()
    for dataset in metadata:
        # One resource per source file; the file is fetched to disk
        # first and then uploaded to CKAN.
        resources = [
            dict(
                description=s['description'],
                name=s['url'].split('/')[-1],
                format=dc.filetype(s['url']),
                upload=dc.disk_fh_for_url(s['url'])
            )
            for s in dataset['resources']
        ]
        print resources
        dc.Dataset.create_or_update(
            name=dataset['title'].lower().replace(' ', '-'),
            title=dataset['title'],
            state='active',
            licence_id='ogl',  # NOTE(review): other loaders use license_id='uk-ogl' — confirm spelling/value
            notes=dataset['description'],
            url='http://systems.hscic.gov.uk/data/ods',
            tags=dc.tags('ODS', 'Organisation', 'Organization'),
            resources=resources,
            owner_org='hscic-ods'
        )
        # NOTE(review): unconditional break — only the first dataset is
        # ever published; presumably left over from testing. Confirm.
        break
    return
def load_ascof(): for directory, metadata_file, metadata in datasets(): resources = [ dict(description=r['description'], name=r['url'].split('/')[-1], format=r['format'], url=r['url'] #upload=open(str(directory/r['url'].split('/')[-1]), 'r') ) for r in metadata['resources'] ] slug = slugify.slugify(metadata['title']).lower() print 'Creating', metadata['title'], slug dc.Dataset.create_or_update( name=slug, title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['summary'], origin=metadata['source'], tags=dc.tags(*metadata['tags']), resources=resources, owner_org='hscic', extras=[ dict(key='coverage_start_date', value=metadata['coverage_start_date']), dict(key='coverage_end_date', value=metadata['coverage_end_date']), dict(key='frequency', value=metadata['frequency']), dict(key='publication_date', value=metadata['publication_date']) ]) return
def load_dataset(dataset, directory): print 'Creating', dataset['title'], dataset['name'] try: extras = [] if dataset.get('coverage_start_date', ''): extras.append(dict(key='coverage_start_date', value=dataset['coverage_start_date'])) if dataset.get('coverage_end_date', ''): extras.append(dict(key='coverage_end_date', value=dataset['coverage_end_date'])) if dataset.get('frequency', ''): extras.append(dict(key='frequency', value=dataset['frequency'])) dc.Dataset.create_or_update( name=dataset['name'], title=dataset['title'], state='active', license_id='uk-ogl', notes=dataset['notes'], origin=dataset['origin'], tags=dc.tags(*dataset['tags']), resources=dataset["resources"], owner_org='hscic', extras=extras, coverage_start_date=dataset.get('coverage_start_date', ''), coverage_end_date=dataset.get('coverage_end_date', ''), ) except Exception, e: print "ERROR: Problem updating/creating dataset - {}".format(dataset['name']) import traceback traceback.print_exc() print ".{}.{}.".format(dataset['coverage_start_date'], dataset['coverage_end_date']) sys.exit(0)
def publish_ods():
    """ Do Useful Work Here """
    # Load the scraped ODS metadata from disk.
    metadatafile = DATA_DIR / 'ods.json'
    metadata = metadatafile.json_load()
    for dataset in metadata:
        # One resource per source file; files are fetched to disk and
        # then uploaded to CKAN.
        resources = [
            dict(description=s['description'],
                 name=s['url'].split('/')[-1],
                 format=dc.filetype(s['url']),
                 upload=dc.disk_fh_for_url(s['url']))
            for s in dataset['resources']
        ]
        print resources
        dc.Dataset.create_or_update(
            name=dataset['title'].lower().replace(' ', '-'),
            title=dataset['title'],
            state='active',
            licence_id='ogl',  # NOTE(review): other loaders use license_id='uk-ogl' — confirm
            notes=dataset['description'],
            url='http://systems.hscic.gov.uk/data/ods',
            tags=dc.tags('ODS', 'Organisation', 'Organization'),
            resources=resources,
            owner_org='hscic-ods')
        # NOTE(review): unconditional break — only the first dataset is
        # ever published; presumably left over from testing. Confirm.
        break
    return
def load_dataset(dataset, directory): print 'Creating', dataset['title'].encode('utf8'), dataset['name'].encode( 'utf8') try: extras = [] if dataset.get('coverage_start_date', ''): extras.append( dict(key='coverage_start_date', value=dataset['coverage_start_date'])) if dataset.get('coverage_end_date', ''): extras.append( dict(key='coverage_end_date', value=dataset['coverage_end_date'])) dc.Dataset.create_or_update(name=dataset['name'], title=dataset['title'], state='active', license_id='uk-ogl', notes=dataset['notes'], origin=dataset['origin'], tags=dc.tags(*dataset['tags']), resources=dataset["resources"], owner_org='nhs-england', extras=extras, frequency='Annually') return True except Exception, e: print "ERROR: Problem updating/creating dataset - {}".format( dataset['name']) print e
def load_pp(): for directory, metadata_file, metadata in datasets(): resources = [ dict( description=r['description'], name=r['name'], format=r['filetype'].upper(), url=r['url'], url_type='', ) for r in metadata['sources'] ] slug = slugify.slugify(metadata['title']).lower() print 'Creating', metadata['title'], slug dc.Dataset.create_or_update( name=slug, title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['summary'], origin=metadata['source'], tags=dc.tags(*metadata['tags']), resources=resources, owner_org='hscic', frequency=metadata['frequency'], extras=[ #dict(key='coverage_start_date', value=metadata['coverage_start_date']), #dict(key='coverage_end_date', value=metadata['coverage_end_date']), #dict(key='frequency', value=metadata['frequency']), dict(key='publication_date', value=metadata['publication_date']) ] ) return
def load_qof(): for metadata in datasets(): resources = [ dict( description=r['description'], name=r['name'], format=r['format'], url=r['url'] ) for r in metadata['resources'] ] print 'Creating', metadata['title'], "with {} resources".format(len(metadata['resources'])) dc.Dataset.create_or_update( name=slugify.slugify(metadata['title']).lower(), title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['summary'], origin=metadata['source'], tags=dc.tags(*metadata['tags']), resources=resources, owner_org='hscic', frequency=metadata['frequency'], extras=[ dict(key='coverage_start_date', value=metadata['coverage_start_date']), dict(key='coverage_end_date', value=metadata['coverage_end_date']), dict(key='publication_date', value=metadata['publication_date']) ] ) print "... done" return
def publish_choose_and_book():
    """ Do Useful Work Here """
    # NOTE(review): `metadata` is not defined in this function — it must
    # be a module-level global loaded elsewhere (the other publishers
    # load a JSON file first). Confirm before running standalone.
    for dataset in metadata:
        resources = [
            dict(
                description=s['description'],
                name=s['description'],  # NOTE(review): name set from description, not URL — confirm intended
                format=s['filetype'],
                upload=dc.disk_fh_for_url(s['url'])
            )
            for s in dataset['resources']
        ]
        dc.Dataset.create_or_update(
            name=dataset['title'].lower().replace(' ', '-'),
            title=dataset['title'],
            state='active',
            licence_id='ogl',  # NOTE(review): other loaders use license_id='uk-ogl' — confirm
            notes=dataset['description'],
            url='http://www.chooseandbook.nhs.uk/staff/bau/reports',
            tags=dc.tags(*dataset['tags']),
            resources=resources,
            owner_org='choose-and-book'
        )
    return
def load_ascof():
    """Create or update one CKAN dataset per scraped ASCOF dataset."""
    for directory, metadata_file, metadata in datasets():
        # Resources point at the remote URL; the local-file upload
        # variant is disabled below.
        resources = [
            dict(
                description=r['description'],
                name=r['url'].split('/')[-1],
                format=r['format'],
                url=r['url']
                #upload=open(str(directory/r['url'].split('/')[-1]), 'r')
            )
            for r in metadata['resources']
        ]
        slug = slugify.slugify(metadata['title']).lower()
        print 'Creating', metadata['title'], slug
        dc.Dataset.create_or_update(
            name=slug,
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['summary'],
            origin=metadata['source'],
            tags=dc.tags(*metadata['tags']),
            resources=resources,
            owner_org='hscic',
            extras=[
                # NOTE(review): indexed with [] — a record missing any of
                # these keys raises KeyError. Confirm the scrape always
                # provides all four.
                dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                dict(key='frequency', value=metadata['frequency']),
                dict(key='publication_date', value=metadata['publication_date'])
            ]
        )
    return
def publish_indicators(start_from=0):
    """Publish HSCIC indicators from the scraped indicators.json file.

    Creates/updates one CKAN dataset per indicator, starting at index
    ``start_from``. Per-indicator failures are logged and skipped.
    """
    indicatorfile = DATA_DIR/'indicators.json'
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:]:
        logging.info('Processing {}'.format(indicator['title']))
        logging.info('ID: {}'.format(indicator['unique identifier'].lower()))
        try:
            # One resource per source file; the file itself is uploaded.
            resources = [
                dict(
                    description=s['description'],
                    name=s['url'].split('/')[-1],
                    format=s['filetype'],
                    upload=dc.fh_for_url(s['url'])
                )
                for s in indicator['sources']
            ]
            dc.Dataset.create_or_update(
                name=indicator['unique identifier'].lower(),
                title=indicator['title'],
                state='active',
                licence_id='ogl',  # NOTE(review): other loaders use license_id='uk-ogl' — confirm
                notes=indicator['definition'],
                url='https://indicators.ic.nhs.uk/webview/',
                tags=dc.tags(*indicator['keyword(s)']),
                resources=resources,
                owner_org='hscic'
            )
        except Exception as ex:
            logging.error(ex)
    return
def load_pp():
    """Create or update one CKAN dataset per scraped PP dataset."""
    for directory, metadata_file, metadata in datasets():
        # Resources link to the remote URL rather than uploading files.
        resources = [
            dict(
                description=r['description'],
                name=r['name'],
                format=r['filetype'].upper(),
                url=r['url'],
                url_type='',
            )
            for r in metadata['sources']
        ]
        slug = slugify.slugify(metadata['title']).lower()
        print 'Creating', metadata['title'], slug
        dc.Dataset.create_or_update(
            name=slug,
            title=metadata['title'],
            state='active',
            license_id='uk-ogl',
            notes=metadata['summary'],
            origin=metadata['source'],
            tags=dc.tags(*metadata['tags']),
            resources=resources,
            owner_org='hscic',
            frequency=metadata['frequency'],
            extras=[
                #dict(key='coverage_start_date', value=metadata['coverage_start_date']),
                #dict(key='coverage_end_date', value=metadata['coverage_end_date']),
                #dict(key='frequency', value=metadata['frequency']),
                dict(key='publication_date', value=metadata['publication_date'])
            ])
    return
def publish_datasets(start_from=0):
    """Publish HSCIC datasets from the scraped datasets.json file.

    Creates/updates one CKAN dataset per record, starting at index
    ``start_from``. Per-record failures are logged and skipped.
    """
    datasetfile = DATA_DIR / 'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} indicators'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))
    for record in datasets[start_from:]:
        logging.info('Processing {}'.format(record['title']))
        logging.info('ID: {}'.format(record['id']))
        try:
            # One resource per source file; the file itself is uploaded.
            resource_list = []
            for src in record['sources']:
                resource_list.append(dict(
                    description=src['description'],
                    name=src['url'].split('/')[-1],
                    format=src['filetype'],
                    upload=dc.fh_for_url(src['url'])))
            # Append key facts to the summary when present.
            notes = record['summary']
            if 'key_facts' in record:
                notes += '\n\nKEY FACTS:\n==========\n\n' + record['key_facts']
            name = 'hscic_dataset_{}'.format(record['id'])
            dc.Dataset.create_or_update(
                name=name,
                title=record['title'],
                state='active',
                licence_id='ogl',
                notes=notes,
                url=record['source'],
                tags=dc.tags(*record['keywords']),
                resources=resource_list,
                owner_org='hscic')
        except Exception as ex:
            logging.error(ex)
    return
def load_nhsof(datasets): counter = 0 # There are only 35 datasets from the scrape, why are we skipping 43. for metadata in datasets: #[43:]: counter += 1 resources = [] for r in metadata['sources']: resources.append({ 'description': r['description'], 'name': r['name'], 'format': r['format'], 'url': r['url'], }) print "Resources ready for upload" metadata['title'] = 'NHSOF - ' + metadata['title'] name = slugify.slugify(metadata['title']).lower()[:99] print u'Creating dataset: {}'.format(name) try: dc.Dataset.create_or_update( name=name, title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['description'], origin='https://indicators.ic.nhs.uk/webview/', tags=dc.tags(*metadata['keyword(s)']), resources=resources, #frequency=['Other', ], owner_org='hscic', extras=[ dict(key='frequency', value='Other'), dict(key='coverage_start_date', value=metadata['coverage_start_date']), dict(key='coverage_end_date', value=metadata['coverage_end_date']), dict(key='domain', value=metadata['domain']), dict(key='origin', value='HSCIC'), dict(key='next_version_due', value=metadata['next version due']), dict(key='HSCIC_unique_id', value=metadata['unique identifier']), dict(key='homepage', value=metadata['homepage']), dict(key='status', value=metadata['status']), dict(key='language', value=metadata['language']), dict(key='release_date', value=metadata['current version uploaded']) ]) except: print u"Failed to create {}".format( slugify.slugify(metadata['title']).lower()[:99]) return counter
def publish_indicators(start_from=0):
    """Publish scraped HSCIC indicators with NHSEngland-specified metadata.

    Processes indicators.json from index ``start_from`` up to record 10
    (the slice below caps the run). Per-record failures are logged and
    skipped.
    """
    indicatorfile = DATA_DIR/'indicators.json'
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    amount = len(indicators)
    logging.info('Processing {} indicators'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    # NOTE(review): the slice stops at record 10 regardless of
    # start_from — presumably a development limit. Confirm.
    for indicator in indicators[start_from:10]:
        logging.info('{} of {}'.format(start_from, amount))
        start_from += 1
        try:
            logging.info('Processing {}'.format(indicator['title']))
            logging.info('ID: {}'.format(indicator['unique identifier'].lower()))
            # One resource per source file; the file itself is uploaded.
            resources = [
                dict(
                    description=s['description'],
                    name=s['url'].split('/')[-1],
                    format=s['filetype'],
                    upload=dc.fh_for_url(s['url'])
                )
                for s in indicator['sources']
            ]
            name = 'hscic_indicator_{}'.format(indicator['unique identifier'].lower())
            # Metadata specified by NHSEngland identified in comments...
            dc.Dataset.create_or_update(
                name=name,  # Unique ID
                title=indicator['title'],  #title
                notes=indicator['definition'],  # description
                tags=dc.tags(*indicator['keyword(s)']),  # tags
                extras=[
                    {'key': 'Public Access Level', 'value': 'Public',},
                    {'key': 'Data Quality Assurance', 'value': 'False'},
                    {'key': 'Release Date', 'value': indicator['current version uploaded'],},
                    {'key': 'Status', 'value': 'Live',},
                ],
                state='active',
                licence_id='ogl',
                url='https://indicators.ic.nhs.uk/webview/',
                resources=resources,
                groups=[
                    {'name': 'indicators'},
                ],
                owner_org='hscic'  # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
def load_nhsof(datasets): counter = 0 # There are only 35 datasets from the scrape, why are we skipping 43. for metadata in datasets: #[43:]: counter += 1 resources = [] for r in metadata['sources']: resources.append({ 'description': r['description'], 'name': r['name'], 'format': r['format'], 'url': r['url'], }) print "Resources ready for upload" metadata['title'] = 'NHSOF - ' + metadata['title'] name = slugify.slugify(metadata['title']).lower()[:99] print u'Creating dataset: {}'.format(name) try: dc.Dataset.create_or_update( name=name, title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['description'], origin='https://indicators.ic.nhs.uk/webview/', tags=dc.tags(*metadata['keyword(s)']), resources=resources, #frequency=['Other', ], owner_org='hscic', extras=[ dict(key='frequency', value='Other'), dict(key='coverage_start_date', value=metadata['coverage_start_date']), dict(key='coverage_end_date', value=metadata['coverage_end_date']), dict(key='domain', value=metadata['domain']), dict(key='origin', value='HSCIC'), dict(key='next_version_due', value=metadata['next version due']), dict(key='HSCIC_unique_id', value=metadata['unique identifier']), dict(key='homepage', value=metadata['homepage']), dict(key='status', value=metadata['status']), dict(key='language', value=metadata['language']), dict(key='release_date', value=metadata['current version uploaded']) ] ) except: print u"Failed to create {}".format(slugify.slugify(metadata['title']).lower()[:99]) return counter
def load_ccgois(datasets): for metadata in datasets: resources = [ dict(description=r['description'], name=r['name'], format=r['filetype'], url=r['url']) for r in metadata['resources'] ] print[r['name'] for r in metadata['resources']] metadata['title'] = u'CCGOIS - {}'.format(metadata['title']) metadata['name'] = make_name_from_title(metadata['title']) print u'Creating {}'.format(metadata['name']) dc.Dataset.create_or_update( name=metadata['name'], title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['description'], origin='https://indicators.ic.nhs.uk/webview/', tags=dc.tags(*metadata['keyword(s)']), resources=resources, #frequency=[metadata['frequency'], ], owner_org='hscic', extras=[ dict(key='frequency', value=metadata.get('frequency', '')), dict(key='coverage_start_date', value=metadata['coverage_start_date']), dict(key='coverage_end_date', value=metadata['coverage_end_date']), dict(key='domain', value=metadata['domain']), dict(key='origin', value='HSCIC'), dict(key='next_version_due', value=metadata['next version due']), dict(key='nhs_OF_indicators', value=metadata['nhs_of_indicators']), dict(key='HSCIC_unique_id', value=metadata['unique identifier']), dict(key='homepage', value=metadata['homepage']), dict(key='status', value=metadata['status']), dict(key='language', value=metadata['language']), dict(key='assurance_level', value=metadata['assurance_level']), dict(key='release_date', value=metadata['current version uploaded']) ]) return
def load_phe(): metadata = json.load(open(DATA_DIR / 'dataset.metadata.json')) resources = [ dict(description=r['description'], name=r['name'], format=r['format'], url='url') for r in metadata['resources'] ] extras = [ dict(key='frequency', value=metadata['frequency']), ] if 'publication_date' in metadata: extras.append( dict(key='publication_date', value=metadata['publication_date'])) extras.append( dict(key='coverage_start_date', value=metadata.get('coverage_start_date', ''))) extras.append( dict(key='coverage_end_date', value=metadata.get('coverage_end_date'))) print extras print 'Creating', metadata['title'] dc.Dataset.create_or_update( name=slugify.slugify(metadata['title']).lower(), title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['summary'], origin=metadata['source'], tags=dc.tags(*metadata['tags']), resources=resources, owner_org='hscic', #extras=extras ) return
def load_phe(): metadata = json.load(open(DATA_DIR/'dataset.metadata.json')) resources = [ dict( description=r['description'], name=r['name'], format=r['format'], url='url' ) for r in metadata['resources'] ] extras = [ dict(key='frequency', value=metadata['frequency']), ] if 'publication_date' in metadata: extras.append(dict(key='publication_date', value=metadata['publication_date'])) extras.append(dict(key='coverage_start_date', value=metadata.get('coverage_start_date',''))) extras.append(dict(key='coverage_end_date', value=metadata.get('coverage_end_date'))) print extras print 'Creating', metadata['title'] dc.Dataset.create_or_update( name=slugify.slugify(metadata['title']).lower(), title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['summary'], origin=metadata['source'], tags=dc.tags(*metadata['tags']), resources=resources, owner_org='hscic', #extras=extras ) return
def publish_datasets(start_from=0):
    """Publish HSCIC datasets from the scraped datasets.json file.

    Creates/updates one CKAN dataset per record, starting at index
    ``start_from``. Per-record failures are logged and skipped.
    """
    datasetfile = DATA_DIR/'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} indicators'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))
    for dataset in datasets[start_from:]:
        logging.info('Processing {}'.format(dataset['title']))
        logging.info('ID: {}'.format(dataset['id']))
        try:
            # One resource per source file; the file itself is uploaded.
            resources = [
                dict(
                    description=s['description'],
                    name=s['url'].split('/')[-1],
                    format=s['filetype'],
                    upload=dc.fh_for_url(s['url'])
                )
                for s in dataset['sources']
            ]
            # Append key facts to the summary when present.
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']
            name = 'hscic_dataset_{}'.format(dataset['id'])
            dc.Dataset.create_or_update(
                name=name,
                title=dataset['title'],
                state='active',
                licence_id='ogl',  # NOTE(review): other loaders use license_id='uk-ogl' — confirm
                notes=notes,
                url=dataset['source'],
                tags=dc.tags(*dataset['keywords']),
                resources=resources,
                owner_org='hscic'
            )
        except Exception as ex:
            logging.error(ex)
    return
def load_ods(): for directory, metadata_file, metadata in datasets(): print 'Processing', metadata['title'], metadata['name'] try: dc.Dataset.create_or_update( name=metadata['name'], title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['notes'], origin=metadata['origin'], tags=dc.tags(*metadata['tags']), resources=metadata["resources"], owner_org='hscic', frequency=metadata['frequency'], extras=[ dict(key='coverage_start_date', value=metadata['coverage_start_date']), dict(key='coverage_end_date', value=metadata['coverage_end_date']), # dict(key='publication_date', value=metadata['publication_date']) ] ) except: print "Failed to process", metadata['name'] return
def publish_datasets(start_from=0):
    """Publish scraped HSCIC datasets with NHSEngland-specified metadata.

    Processes datasets.json from index ``start_from`` up to record 10
    (the slice below caps the run). Each record becomes a CKAN dataset
    with extras, groups and uploaded resources. Per-record failures are
    logged and skipped.
    """
    datasetfile = DATA_DIR / 'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    amount = len(datasets)
    logging.info('Processing {} datasets'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    # NOTE(review): the slice stops at record 10 regardless of
    # start_from — presumably a development limit. Confirm.
    for dataset in datasets[start_from:10]:
        try:
            logging.info('{} of {}'.format(start_from, amount))
            start_from += 1
            logging.info('Processing {}'.format(dataset['title']))
            logging.info('ID: {}'.format(dataset['id']))
            # One resource per source file; the file itself is uploaded.
            resources = [
                dict(description=s['description'],
                     name=s['url'].split('/')[-1],
                     format=s['filetype'],
                     upload=dc.fh_for_url(s['url']))
                for s in dataset['sources']
            ]
            # Append key facts to the summary when present.
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']
            # Fixed metadata required by NHSEngland, plus optional
            # fields present in the scrape.
            extras = [
                {
                    'key': 'Public Access Level',
                    'value': 'Public',
                },
                {
                    'key': 'Data Quality Assurance',
                    'value': 'False'
                },
                {
                    'key': 'Status',
                    'value': 'Live',
                },
            ]
            if 'date_range' in dataset:
                extras.append({
                    'key': 'Time period',
                    'value': dataset['date_range'],
                })
            if 'publication_date' in dataset:
                extras.append({
                    'key': 'Release date',
                    'value': dataset['publication_date'],
                })
            if 'geographical_coverage' in dataset:
                extras.append({
                    'key': 'Geographical coverage',
                    'value': ', '.join(dataset['geographical_coverage'])
                })
            # groups: topics and information types both map to groups.
            groups = []
            for item in dataset['topics']:
                groups.append(item)
            for item in dataset['information_types']:
                groups.append(item)
            group_faff = []
            for g in groups:
                # ensure_group returns the canonical CKAN group name,
                # creating the group if necessary.
                group_name = dc.ensure_group(g, 'HSCIC')
                group_faff.append({
                    'name': group_name,
                })
            name = 'hscic_dataset_{}'.format(dataset['id'])
            # NHSEngland metadata as comments...
            dc.Dataset.create_or_update(
                name=name,  # Unique ID
                title=dataset['title'],  # title
                notes=notes,  # description
                tags=dc.tags(*dataset['keywords']),  # tags
                extras=extras,
                state='active',
                licence_id='ogl',
                url=dataset['source'],
                resources=resources,
                groups=group_faff,
                owner_org='hscic'  # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
def publish_indicators(start_from=0):
    """Publish scraped HSCIC indicators with NHSEngland-specified metadata.

    Processes indicators.json from index ``start_from`` up to record 10
    (the slice below caps the run). Per-record failures are logged and
    skipped.
    """
    indicatorfile = DATA_DIR / 'indicators.json'
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    amount = len(indicators)
    logging.info('Processing {} indicators'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    # NOTE(review): the slice stops at record 10 regardless of
    # start_from — presumably a development limit. Confirm.
    for indicator in indicators[start_from:10]:
        logging.info('{} of {}'.format(start_from, amount))
        start_from += 1
        try:
            logging.info('Processing {}'.format(indicator['title']))
            logging.info('ID: {}'.format(
                indicator['unique identifier'].lower()))
            # One resource per source file; the file itself is uploaded.
            resources = [
                dict(description=s['description'],
                     name=s['url'].split('/')[-1],
                     format=s['filetype'],
                     upload=dc.fh_for_url(s['url']))
                for s in indicator['sources']
            ]
            name = 'hscic_indicator_{}'.format(
                indicator['unique identifier'].lower())
            # Metadata specified by NHSEngland identified in comments...
            dc.Dataset.create_or_update(
                name=name,  # Unique ID
                title=indicator['title'],  #title
                notes=indicator['definition'],  # description
                tags=dc.tags(*indicator['keyword(s)']),  # tags
                extras=[
                    {
                        'key': 'Public Access Level',
                        'value': 'Public',
                    },
                    {
                        'key': 'Data Quality Assurance',
                        'value': 'False'
                    },
                    {
                        'key': 'Release Date',
                        'value': indicator['current version uploaded'],
                    },
                    {
                        'key': 'Status',
                        'value': 'Live',
                    },
                ],
                state='active',
                licence_id='ogl',
                url='https://indicators.ic.nhs.uk/webview/',
                resources=resources,
                groups=[
                    {
                        'name': 'indicators'
                    },
                ],
                owner_org='hscic'  # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
for r in metadata['resources'] ] print [r['name'] for r in metadata['resources']] metadata['title'] = u'CCGOIS - {}'.format(metadata['title']) metadata['name'] = make_name_from_title(metadata['title']) print u'Creating {}'.format(metadata['name']) dc.Dataset.create_or_update( name=metadata['name'], title=metadata['title'], state='active', license_id='uk-ogl', notes=metadata['description'], origin='https://indicators.ic.nhs.uk/webview/', tags=dc.tags(*metadata['keyword(s)']), resources=resources, #frequency=[metadata['frequency'], ], owner_org='hscic', extras=[ dict(key='frequency', value=metadata.get('frequency', '')), dict(key='coverage_start_date', value=metadata['coverage_start_date']), dict(key='coverage_end_date', value=metadata['coverage_end_date']), dict(key='domain', value=metadata['domain']), dict(key='origin', value='HSCIC'), dict(key='next_version_due', value=metadata['next version due']), dict(key='nhs_OF_indicators', value=metadata['nhs_of_indicators']), dict(key='HSCIC_unique_id', value=metadata['unique identifier']), dict(key='homepage', value=metadata['homepage']), dict(key='status', value=metadata['status']), dict(key='language', value=metadata['language']),
def publish_datasets(start_from=0):
    """Publish scraped HSCIC datasets with NHSEngland-specified metadata.

    Processes datasets.json from index ``start_from`` up to record 10
    (the slice below caps the run). Each record becomes a CKAN dataset
    with extras, groups and uploaded resources. Per-record failures are
    logged and skipped.
    """
    datasetfile = DATA_DIR/'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    amount = len(datasets)
    logging.info('Processing {} datasets'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    # NOTE(review): the slice stops at record 10 regardless of
    # start_from — presumably a development limit. Confirm.
    for dataset in datasets[start_from:10]:
        try:
            logging.info('{} of {}'.format(start_from, amount))
            start_from += 1
            logging.info('Processing {}'.format(dataset['title']))
            logging.info('ID: {}'.format(dataset['id']))
            # One resource per source file; the file itself is uploaded.
            resources = [
                dict(
                    description=s['description'],
                    name=s['url'].split('/')[-1],
                    format=s['filetype'],
                    upload=dc.fh_for_url(s['url'])
                )
                for s in dataset['sources']
            ]
            # Append key facts to the summary when present.
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']
            # Fixed metadata required by NHSEngland, plus optional
            # fields present in the scrape.
            extras = [
                {'key': 'Public Access Level', 'value': 'Public',},
                {'key': 'Data Quality Assurance', 'value': 'False'},
                {'key': 'Status', 'value': 'Live',},
            ]
            if 'date_range' in dataset:
                extras.append({
                    'key': 'Time period',
                    'value': dataset['date_range'],
                })
            if 'publication_date' in dataset:
                extras.append({
                    'key': 'Release date',
                    'value': dataset['publication_date'],
                })
            if 'geographical_coverage' in dataset:
                extras.append({
                    'key': 'Geographical coverage',
                    'value': ', '.join(dataset['geographical_coverage'])
                })
            # groups: topics and information types both map to groups.
            groups = []
            for item in dataset['topics']:
                groups.append(item)
            for item in dataset['information_types']:
                groups.append(item)
            group_faff = []
            for g in groups:
                # ensure_group returns the canonical CKAN group name,
                # creating the group if necessary.
                group_name = dc.ensure_group(g, 'HSCIC')
                group_faff.append({
                    'name': group_name,
                })
            name = 'hscic_dataset_{}'.format(dataset['id'])
            # NHSEngland metadata as comments...
            dc.Dataset.create_or_update(
                name=name,  # Unique ID
                title=dataset['title'],  # title
                notes=notes,  # description
                tags=dc.tags(*dataset['keywords']),  # tags
                extras=extras,
                state='active',
                licence_id='ogl',
                url=dataset['source'],
                resources=resources,
                groups=group_faff,
                owner_org='hscic'  # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
def publish_datasets(start_from=0):
    """Publish curated HSCIC datasets to CKAN, assigning them to groups.

    Loads datasets.json, filters through a Curator (records not in any
    group are skipped), prefixes titles, cleans tags, converts notes to
    markdown and creates/updates one CKAN dataset per record. After
    creation the dataset is added to each of its groups. Per-record
    failures print a traceback and the run continues.
    """
    global DATA_DIR
    u = Uploader("hscic-datasets")
    datasetfile = ffs.Path(get_resource_path('datasets.json'))
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} indicators'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))
    import random  # NOTE(review): unused import — confirm and remove
    total = len(datasets) - start_from
    current = 1
    for dataset in datasets[start_from:]:
        print "STATUS: {}/{}".format(current, total)
        current += 1
        #print u'Processing {}'.format(dataset['title'])
        #print ' ID: {}'.format(dataset['id'])
        try:
            # Resources link to the remote URL; the S3 upload path is
            # currently disabled (dead string-literal block below).
            resources = []
            for s in dataset['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'],
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                resource['url'] = u.upload(path)
                """
                resources.append(resource)
            if not resources:
                print "Dataset {} does not have any resources".format(
                    dataset['id'])
                continue
            title = dataset['title']
            c = Curator(dataset)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue
            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)
            # NOTE(review): this slug is computed but immediately
            # overwritten by the hscic_dataset_<id> name below — confirm
            # which is intended.
            name = slugify.slugify(title).lower()[0:99]
            # Call cleantags on each work and expect back a list, which is then flattened
            tags = []
            if 'keywords' in dataset:
                dataset['keywords'] = sum([
                    clean_tag(k) for k in dataset.get('keywords', [])
                    if len(k) > 2
                ], [])
                tags = dc.tags(*dataset['keywords'])
            # Append key facts to the summary, then convert to markdown.
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\n<h2>KEY FACTS:</h2>\n' + ''.join(
                    dataset['key_facts'])
            notes = to_markdown(notes)
            name = 'hscic_dataset_{}'.format(dataset['id'])
            dc.Dataset.create_or_update(name=name,
                                        title=title,
                                        state='active',
                                        licence_id='ogl',
                                        notes=notes,
                                        url=dataset['source'],
                                        tags=tags,
                                        resources=resources,
                                        owner_org='hscic')
            # Add the freshly created/updated dataset to its groups.
            if groups:
                try:
                    dataset = dc.ckan.action.package_show(id=name)
                except:
                    continue
                for group in groups:
                    group = group.lower()
                    if [
                            g for g in dataset.get('groups', [])
                            if g['name'] == group
                    ]:
                        # `g` leaks from the comprehension above
                        # (Python 2 scoping) and is bound here because
                        # the list is non-empty.
                        print 'Already in group', g['name']
                    else:
                        dc.ensure_group(group)
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()
    u.close()
    return
def publish_indicators(start_from=0): global DATA_DIR u = Uploader("hscic-indicators") indicatorfile = ffs.Path(get_resource_path('indicators.json')) logging.info('Loading {}'.format(indicatorfile)) indicators = indicatorfile.json_load() logging.info('Processing {} indicators'.format(len(indicators))) logging.info('Starting from record {}'.format(start_from)) for indicator in indicators[start_from:]: try: resources = [] for s in indicator['sources']: resource = { "description": s['description'], "name": s['url'].split('/')[-1], "format": s['filetype'].upper(), "url": s["url"] } """ filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url """ resources.append(resource) if not 'indicators' in indicator['keyword(s)']: indicator['keyword(s)'].append('indicators') title = indicator['title'] c = Curator(indicator) groups = c.get_groups() if not groups: print "Not in a group" continue prefix = c.get_title_prefix() if prefix: title = u"{} - {}".format(prefix, title) tags = [] if 'keyword(s)' in dataset: dataset['keyword(s)'] = sum([ clean_tag(k) for k in indicator.get('keyword(s)', []) if len(k) > 2 ], []) tags = dc.tags(*dataset['keywords']) print '+ Create/Update dataset {}'.format(indicator['title']) dc.Dataset.create_or_update( name=slugify.slugify(title).lower()[:99], title=title, state='active', licence_id='ogl', notes=to_markdown(indicator['definition'].encode('utf8')), url='https://indicators.ic.nhs.uk/webview/', tags=dc.tags(tags), resources=resources, owner_org='hscic') if groups: try: dataset = dc.ckan.action.package_show( id=slugify.slugify(title)[:99].lower()) except: continue for group in groups: group = group.lower() if [ g for g in dataset.get('groups', []) if g['name'] == group ]: print 'Already in group', g['name'] else: dc.ckan.action.member_create(id=group, object=dataset_name, object_type='package', capacity='member') except Exception as 
ex: import traceback traceback.print_exc() import sys sys.exit(1) u.close() return