def main(workspace):
    DATA_DIR = ffs.Path(workspace)
    datasets = json.load(open(DATA_DIR / 'ccgois_indicators.json'))
    dc.ensure_publisher('hscic')
    dc.ensure_group('ccgois')
    load_ccgois(datasets)
    group_ccgois(datasets)
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    dc.ensure_publisher('phe')
    dc.ensure_group('phof')
    load_phe()
    # group_phe()
    return 0
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    dc.ensure_publisher('hscic')
    dc.ensure_group('qof')
    load_qof()
    group_qof()
    return 0
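# A minimal sketch of how one of these main(workspace) entry points might be
# driven from the command line; the argparse wiring and the sys.exit call are
# illustrative assumptions, not part of the original scripts.
if __name__ == '__main__':
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description='Publish metadata to the data catalogue')
    parser.add_argument('workspace',
                        help='directory containing the scraped metadata')
    args = parser.parse_args()
    sys.exit(main(args.workspace))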
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()
    dc.ensure_publisher('nhs-england')
    dc.ensure_group('surveys')
    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), "r"))
    for dataset in datasets:
        if load_dataset(dataset, DATA_DIR):
            groups(dataset)
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()
    dc.ensure_publisher('nhs-england')
    dc.ensure_group('statistics')
    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), "r"))
    for dataset in datasets:
        if load_statistic(dataset, DATA_DIR):
            groups(dataset)
def groups(dataset):
    groups = dataset.get('groups', [])
    dataset = dc.ckan.action.package_show(id=dataset["name"])
    for grp in groups:
        if [g for g in dataset['groups'] if g['name'].lower() == grp]:
            print 'Already in group', g['name']
        else:
            dc.ensure_group(grp)
            dc.ckan.action.member_create(id=grp,
                                         object=dataset['name'],
                                         object_type='package',
                                         capacity='member')
    return
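# Illustrative only: the shape of the metadata record that groups() expects.
# The package name and group names below are invented; the package must
# already exist in CKAN because groups() looks it up with package_show.
example_dataset = {
    'name': 'example-statistics-2014',
    'groups': ['surveys', 'statistics'],
}
# groups(example_dataset)  # would add the package to both groups, creating
#                          # them first if they do not exist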
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()
    datasets = json.load(
        get_resource_file(DATA_DIR / 'nhsof_metadata_indicators.json'))
    print "Ensuring publisher"
    dc.ensure_publisher('hscic')
    print "Ensuring group"
    dc.ensure_group('nhsof')
    wrote = load_nhsof(datasets)
    if wrote:
        group_nhsof(datasets)
    else:
        print "Created/processed no datasets ..."
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()
    dc.ensure_publisher('gp-survey')
    dc.ensure_group('surveys')

    def year_as_key(x):
        return x['title'][-4:]

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), "r"))
    datasets = sorted(datasets, key=year_as_key)
    for dataset in datasets:
        load_statistic(dataset, DATA_DIR)
        groups(dataset)
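# Worked example of the year_as_key ordering above (the key function is
# repeated here because the original is local to main); the titles are invented.
def year_as_key(x):
    return x['title'][-4:]

examples = [{'title': 'GP Patient Survey 2014'},
            {'title': 'GP Patient Survey 2012'}]
print [d['title'] for d in sorted(examples, key=year_as_key)]
# -> ['GP Patient Survey 2012', 'GP Patient Survey 2014']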
def groups(dataset):
    groups = dataset.get('groups', [])
    dataset = dc.ckan.action.package_show(id=dataset["name"])
    for grp in groups:
        if [g for g in dataset['groups'] if g['name'].lower() == grp]:
            print 'Already in group', g['name']
        else:
            dc.ensure_group(grp)
            dc.ckan.action.member_create(id=grp,
                                         object=dataset['name'],
                                         object_type='package',
                                         capacity='member')
    return
def curate_hscic_indicators():
    """
    Curate it!
    """
    dc.ensure_group('nhsof')
    dc.ensure_group('ccgois')
    indicators = DATA_DIR / 'indicators.json'
    data = indicators.json_load()
    for indicator in data:
        if is_indicator(indicator):
            framework = determine_framework(indicator)
            number = indicator['title'].split(' ')[0]
            print framework, number
            unique_id = 'hscic_indicator_' + indicator['unique identifier'].lower()
            print unique_id
            newname = u'{0} {1}'.format(framework, indicator['title'])
            rename(unique_id, newname)
            add_tag(indicator, framework)
            add_to_group(indicator, framework)
    return
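# Worked example of the id and title building in curate_hscic_indicators(),
# using an invented indicator record; only the string handling from the
# function above is shown here.
indicator = {
    'unique identifier': 'P01234',
    'title': '1.1 Example indicator title',
}
unique_id = 'hscic_indicator_' + indicator['unique identifier'].lower()
# -> 'hscic_indicator_p01234'
newname = u'{0} {1}'.format('NHSOF', indicator['title'])
# -> u'NHSOF 1.1 Example indicator title'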
def publish_datasets(start_from=0):
    global DATA_DIR
    u = Uploader("hscic-datasets")

    datasetfile = ffs.Path(get_resource_path('datasets.json'))
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} indicators'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))

    import random
    total = len(datasets) - start_from
    current = 1
    for dataset in datasets[start_from:]:
        print "STATUS: {}/{}".format(current, total)
        current += 1
        # print u'Processing {}'.format(dataset['title'])
        # print ' ID: {}'.format(dataset['id'])
        try:
            resources = []
            for s in dataset['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'],
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                resource['url'] = u.upload(path)
                """
                resources.append(resource)

            if not resources:
                print "Dataset {} does not have any resources".format(
                    dataset['id'])
                continue

            title = dataset['title']

            c = Curator(dataset)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)
            name = slugify.slugify(title).lower()[0:99]

            # Call clean_tag on each keyword and expect back a list, which is
            # then flattened
            tags = []
            if 'keywords' in dataset:
                dataset['keywords'] = sum([clean_tag(k)
                                           for k in dataset.get('keywords', [])
                                           if len(k) > 2], [])
                tags = dc.tags(*dataset['keywords'])

            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\n<h2>KEY FACTS:</h2>\n' + ''.join(
                    dataset['key_facts'])
            notes = to_markdown(notes)

            name = 'hscic_dataset_{}'.format(dataset['id'])

            dc.Dataset.create_or_update(name=name,
                                        title=title,
                                        state='active',
                                        licence_id='ogl',
                                        notes=notes,
                                        url=dataset['source'],
                                        tags=tags,
                                        resources=resources,
                                        owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(id=name)
                except:
                    continue

                for group in groups:
                    group = group.lower()
                    if [g for g in dataset.get('groups', [])
                            if g['name'] == group]:
                        print 'Already in group', g['name']
                    else:
                        dc.ensure_group(group)
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()

    u.close()
    return
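# clean_tag() is defined elsewhere in the script; from its use above it takes a
# single keyword and returns a list of zero or more CKAN-safe tags, which is why
# the results are flattened with sum(..., []).  A rough sketch of that contract,
# with made-up cleaning rules:
import re


def clean_tag_sketch(keyword):
    cleaned = re.sub(r'[^\w\s-]', '', keyword).strip().lower()
    return [cleaned] if cleaned else []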
def publish_datasets(start_from=0):
    datasetfile = DATA_DIR / 'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    amount = len(datasets)
    logging.info('Processing {} datasets'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
    for dataset in datasets[start_from:10]:
        try:
            logging.info('{} of {}'.format(start_from, amount))
            start_from += 1
            logging.info('Processing {}'.format(dataset['title']))
            logging.info('ID: {}'.format(dataset['id']))
            resources = [
                dict(description=s['description'],
                     name=s['url'].split('/')[-1],
                     format=s['filetype'],
                     upload=dc.fh_for_url(s['url']))
                for s in dataset['sources']
            ]

            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']

            extras = [
                {'key': 'Public Access Level', 'value': 'Public'},
                {'key': 'Data Quality Assurance', 'value': 'False'},
                {'key': 'Status', 'value': 'Live'},
            ]
            if 'date_range' in dataset:
                extras.append({
                    'key': 'Time period',
                    'value': dataset['date_range'],
                })
            if 'publication_date' in dataset:
                extras.append({
                    'key': 'Release date',
                    'value': dataset['publication_date'],
                })
            if 'geographical_coverage' in dataset:
                extras.append({
                    'key': 'Geographical coverage',
                    'value': ', '.join(dataset['geographical_coverage'])
                })

            # groups
            groups = []
            for item in dataset['topics']:
                groups.append(item)
            for item in dataset['information_types']:
                groups.append(item)
            group_faff = []
            for g in groups:
                group_name = dc.ensure_group(g, 'HSCIC')
                group_faff.append({'name': group_name})

            name = 'hscic_dataset_{}'.format(dataset['id'])

            # NHSEngland metadata as comments...
            dc.Dataset.create_or_update(
                name=name,                           # Unique ID
                title=dataset['title'],              # title
                notes=notes,                         # description
                tags=dc.tags(*dataset['keywords']),  # tags
                extras=extras,
                state='active',
                licence_id='ogl',
                url=dataset['source'],
                resources=resources,
                groups=group_faff,
                owner_org='hscic'                    # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
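# Illustrative only: the minimum fields a record in datasets.json needs for the
# loop above to run; every value here is invented.
example_record = {
    'id': '12345',
    'title': 'Example statistical publication',
    'summary': 'Short description of the publication.',
    'source': 'http://www.hscic.gov.uk/example',
    'keywords': ['example'],
    'topics': ['Hospital care'],
    'information_types': ['Official statistics'],
    'sources': [{
        'description': 'CSV download',
        'url': 'http://www.hscic.gov.uk/example/data.csv',
        'filetype': 'csv',
    }],
}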
def main():
    dc.ensure_publisher('hscic')
    dc.ensure_group('indicators', 'hscic')
    publish_indicators()
    publish_datasets()
    return 0