def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    metadata_file = DATA_DIR / 'dataset.metadata.json'
    datasets = json.load(open(metadata_file, 'r'))

    u = Uploader("ods")
    unzipper = Unzipper()
    for dataset in datasets:
        has_zip = False
        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename
            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            if resource['format'].upper() == 'ZIP':
                has_zip = True
        if has_zip:
            print "Processing ZIP files in dataset"
            print '*' * 30
            unzipper.unzip(dataset)
            print '*' * 30
    u.close()

    json.dump(datasets, open(metadata_file, 'w'))

def add_metadata_to_pp_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ["GP", "Population"]
        title = metadata['title']
        #begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        #ends = datetime.date(begins.year + 1, 3, 31)
        #metadata['coverage_start_date'] = begins.isoformat()
        #metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'Quarterly'
        print metadata['title']

        u = Uploader("pp")
        for resource in metadata['sources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename
            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
            resource['name'] = resource['url'].split('/')[-1]
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

def add_metadata_to_qof_datasets():
    u = Uploader("nshof")
    f = os.path.join(DATA_DIR, "nhsof_metadata_indicators.json")
    datasets = json.load(open(f))
    for metadata in datasets:
        metadata['tags'] = ['QOF', 'Quality Outcomes Framework']
        title = metadata['title']
        #metadata['frequency'] = 'yearly'
        #metadata['title'] = 'QOF - National Quality Outcomes Framework - {0}-{1}'.format(match.group(1), match.group(2))
        resources = []
        for resource in metadata['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]
            resource['url_type'] = ''

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename
            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url

            resources.append(resource)
        metadata['resources'] = resources
    u.close()

    json.dump(
        datasets,
        open(os.path.join(DATA_DIR, "nhsof_metadata_indicators.json"), "w"))
    return

def add_metadata_to_phof_datasets():
    metadata_file = DATA_DIR / 'dataset.metadata.json'
    metadata = metadata_file.json_load()
    metadata['tags'] = ['PHOF', 'Public Health Outcomes Framework']
    metadata['title'] = 'PHOF - Public Health Outcomes Framework'
    metadata['frequency'] = 'yearly'
    metadata['summary'] = PHOF_SUMMARY
    metadata['source'] = 'http://www.phoutcomes.info/public-health-outcomes-framework'
    metadata['coverage_start_date'] = '2000-01-01'
    metadata['coverage_end_date'] = '2013-12-31'

    u = Uploader("phof")
    for resource in metadata['resources']:
        filename = filename_for_resource(resource)
        path = DATA_DIR / filename
        download_file(resource['url'], path)
        print "Uploading to S3"
        url = u.upload(path)
        resource['url'] = url
    u.close()

    metadata_file.truncate()
    metadata_file << json.dumps(metadata, indent=2)
    return

def add_metadata_to_ascof_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['ASCOF', 'Adult Social Care Outcomes Framework']
        title = metadata['title']
        match = re.search(r'(\d{4})-(\d{2})', title)
        begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        ends = datetime.date(begins.year + 1, 3, 31)
        metadata['coverage_start_date'] = begins.isoformat()
        metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'yearly'
        metadata['title'] = 'ASCOF - Adult Social Care Outcomes Framework, England -{0}-{1}'.format(
            match.group(1), match.group(2))

        u = Uploader("ascof")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename
            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    u = Uploader("gp-survey")
    for dataset in datasets:
        print "Processing", dataset['name']
        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename
            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
    u.close()

    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))
    return 0

def add_metadata_to_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['Mental Health']

        u = Uploader("mhmds")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename
            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

def retrieve_qof_datasets(datasets):
    results = []

    u = Uploader("qof")
    for dataset in datasets:
        print dataset['title']
        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename
            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
        results.append(dataset)
    u.close()

    metadata_file = DATA_DIR / 'dataset.metadata.json'
    if metadata_file:
        metadata_file.truncate()
    metadata_file << json.dumps(results, indent=2)

def main(workspace):
    DATA_DIR = ffs.Path(workspace)
    datasets = json.load(open(DATA_DIR / 'ccgois_indicators.json'))

    u = Uploader("ccgois")
    for dataset in datasets:
        resources = []
        for resource in dataset['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename
            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url

            resources.append(resource)
        dataset['resources'] = resources
    u.close()

    json.dump(datasets, open(DATA_DIR / 'ccgois_indicators.json', 'w'))

def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    tag_list = ["Statistics"]

    u = Uploader("stats")
    for dataset in datasets:
        print "Processing", dataset['name']
        print "..adding tags"
        tags = dataset.get('tags', [])
        for t in tag_list:
            if t not in tags:
                tags.append(t)
        dataset['tags'] = tags

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename
            try:
                download_file(resource['url'], path)
            except:
                continue
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resource['url_type'] = ''  # make sure we zap historical uploads
    u.close()

    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))
    return 0

class Unzipper(object):

    def unzip(self, package):
        """
        Processes each resource in the package, and possibly also adds
        more resources if we manage to extract some.
        """
        extra_resources = []
        self.uploader = Uploader("unzipped/{}".format(package['name']))

        updated_resources = []
        print " +", package['name']
        for resource in package['resources']:
            if not resource['format'] == 'ZIP':
                updated_resources.append(resource)
                continue

            updated_resources.append(resource)
            print " +", resource['name']
            extract_zip_to = self.unzip_file(resource['url'])
            print " + Processing files in ", extract_zip_to

            files = []
            for (dirpath, dirnames, filenames) in os.walk(extract_zip_to):
                files.extend([os.path.join(dirpath, p) for p in filenames])

            for f in files:
                res = self.local_file_to_resource(f, resource)
                if res:
                    updated_resources.append(res)

            shutil.rmtree(extract_zip_to)

        package['resources'] = updated_resources
        self.uploader.close()

        if extra_resources:
            package['resources'].extend(extra_resources)

    def local_file_to_resource(self, local_file, parent_resource):
        print "Adding {} from {}".format(local_file, parent_resource['name'])
        if local_file.lower().endswith('.zip'):
            return None

        filename = local_file.split('/')[-1]
        parent_desc = parent_resource['description']
        if parent_desc == parent_resource['name']:
            parent_desc = ""
        description = u"(Extracted from {}) {}".format(
            parent_resource['name'], parent_desc)

        resource = {
            # strip non-breaking spaces from the description
            "description": description.replace(u'\u00a0', ' '),
            "name": filename,
            "format": filename.split('.')[-1].upper(),
        }

        hash_file = create_hash_file(local_file)
        try:
            url = self.uploader.upload(local_file)
            resource['url'] = url
        except Exception as e:
            print e
            return None
        os.unlink(hash_file)

        return resource

def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'

    org = dgu.action.organization_show(id=TARGET_ORGANISATION)
    if not _org_existsp(TARGET_ORGANISATION):
        catalogue.action.organization_create(
            name=org['name'],
            title=org['title'],
            description=org['description'],
            image_url=org['image_display_url'])

    print "Found {0} datasets on source".format(len(org['packages']))
    for package in org['packages']:
        print 'uploading', package['title'].encode('utf8')
        dataset_dir = DATA_DIR / package['name']

        # Get the dataset from DGU
        dataset = dgu.action.package_show(id=package['name'])
        del dataset['id']

        # Set the new owning organisation
        dataset['owner_org'] = org['name']

        u = Uploader("hqip")
        for resource in dataset['resources']:
            resource['name'] = resource['description']
            if resource['format'] == "HTML":
                continue
            if resource['url'].startswith('hhttps'):
                resource['url'] = resource['url'].replace('hhttps', 'https')
            if 'cache_filepath' in resource:
                del resource['cache_filepath']
            if 'tracking_summary' in resource:
                del resource['tracking_summary']

            filename = filename_for_resource(resource)
            datafile = dataset_dir / filename
            print 'downloading', resource['url'], 'as', datafile
            try:
                download_file(resource['url'], datafile)
                print "Uploading to S3"
                url = u.upload(datafile)
                resource['url'] = url
            except:
                print '***' * 30
                print "Failed to download: ", resource['url']
        u.close()

        # Add a nice tag so we can find them all again
        dataset['tags'].append({'name': 'HQIP'})

        print 'Owner org is', org['name']
        try:
            extras = []
            if 'temporal_coverage-from' in dataset:
                extras.append(dict(
                    key='coverage_start_date',
                    value=format_date(dataset['temporal_coverage-from'])))
            if 'temporal_coverage-to' in dataset:
                extras.append(dict(
                    key='coverage_end_date',
                    value=format_date(dataset['temporal_coverage-to'])))
            if 'update_frequency' in dataset:
                extras.append(dict(
                    key='frequency',
                    value=dataset['update_frequency']))

            new_dataset = Dataset.create_or_update(
                name=dataset['name'],
                title=dataset['title'],
                state='active',
                visibility='private',
                license_id='uk-ogl',
                notes=dataset['notes'],
                origin=dataset['url'],
                tags=dataset['tags'],
                resources=dataset['resources'],
                owner_org=org['name'],
                extras=extras)
            print "Created {}".format(dataset['name'])
        except ValueError as e:
            print 'skipping because error', e
            continue
        except ValidationError:
            print "Failed to upload {}".format(dataset['name'])
            raise

def publish_indicators(start_from=0):
    global DATA_DIR
    u = Uploader("hscic-indicators")

    indicatorfile = ffs.Path(get_resource_path('indicators.json'))
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:]:
        try:
            resources = []
            for s in indicator['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'].upper(),
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                print "Uploading to S3"
                url = u.upload(path)
                resource['url'] = url
                """
                resources.append(resource)

            if 'indicators' not in indicator['keyword(s)']:
                indicator['keyword(s)'].append('indicators')

            title = indicator['title']

            c = Curator(indicator)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)

            # Clean each keyword and flatten the resulting lists into tags
            tags = []
            if 'keyword(s)' in indicator:
                indicator['keyword(s)'] = sum([
                    clean_tag(k) for k in indicator.get('keyword(s)', [])
                    if len(k) > 2
                ], [])
                tags = dc.tags(*indicator['keyword(s)'])

            print '+ Create/Update dataset {}'.format(indicator['title'])
            dc.Dataset.create_or_update(
                name=slugify.slugify(title).lower()[:99],
                title=title,
                state='active',
                licence_id='ogl',
                notes=to_markdown(indicator['definition'].encode('utf8')),
                url='https://indicators.ic.nhs.uk/webview/',
                tags=tags,
                resources=resources,
                owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(
                        id=slugify.slugify(title)[:99].lower())
                except:
                    continue
                for group in groups:
                    group = group.lower()
                    if [g for g in dataset.get('groups', [])
                            if g['name'] == group]:
                        print 'Already in group', group
                    else:
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()
            import sys
            sys.exit(1)

    u.close()
    return

def publish_datasets(start_from=0):
    global DATA_DIR
    u = Uploader("hscic-datasets")

    datasetfile = ffs.Path(get_resource_path('datasets.json'))
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} datasets'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))

    total = len(datasets) - start_from
    current = 1
    for dataset in datasets[start_from:]:
        print "STATUS: {}/{}".format(current, total)
        current += 1

        #print u'Processing {}'.format(dataset['title'])
        #print ' ID: {}'.format(dataset['id'])
        try:
            resources = []
            for s in dataset['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'],
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                resource['url'] = u.upload(path)
                """
                resources.append(resource)

            if not resources:
                print "Dataset {} does not have any resources".format(
                    dataset['id'])
                continue

            title = dataset['title']

            c = Curator(dataset)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)

            # Call clean_tag on each keyword and flatten the resulting
            # lists into a single list of tags
            tags = []
            if 'keywords' in dataset:
                dataset['keywords'] = sum([
                    clean_tag(k) for k in dataset.get('keywords', [])
                    if len(k) > 2
                ], [])
                tags = dc.tags(*dataset['keywords'])

            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\n<h2>KEY FACTS:</h2>\n' + ''.join(
                    dataset['key_facts'])
            notes = to_markdown(notes)

            name = 'hscic_dataset_{}'.format(dataset['id'])
            dc.Dataset.create_or_update(name=name,
                                        title=title,
                                        state='active',
                                        licence_id='ogl',
                                        notes=notes,
                                        url=dataset['source'],
                                        tags=tags,
                                        resources=resources,
                                        owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(id=name)
                except:
                    continue
                for group in groups:
                    group = group.lower()
                    if [g for g in dataset.get('groups', [])
                            if g['name'] == group]:
                        print 'Already in group', group
                    else:
                        dc.ensure_group(group)
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()

    u.close()
    return
