def main(workspace): global DATA_DIR DATA_DIR = ffs.Path(workspace) / 'data' DATA_DIR.mkdir() metadata_file = DATA_DIR / 'dataset.metadata.json' datasets = json.load(open(metadata_file, 'r')) u = Uploader("ods") unzipper = Unzipper() for dataset in datasets: has_zip = False for resource in dataset['resources']: filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url if resource['format'].upper() == 'ZIP': has_zip = True if has_zip: print "Processing ZIP files in dataset" print '*' * 30 unzipper.unzip(dataset) print '*' * 30 u.close() json.dump(datasets, open(metadata_file, 'w'))
def add_metadata_to_pp_datasets(): for directory, metadata_file, metadata in datasets(): metadata['tags'] = ["GP", "Population"] title = metadata['title'] #begins = datetime.date(year=int(match.group(1)), month=4, day=1) #ends = datetime.date(begins.year + 1, 3, 31) #metadata['coverage_start_date'] = begins.isoformat() #metadata['coverage_end_date'] = ends.isoformat() metadata['frequency'] = 'Quarterly' print metadata['title'] u = Uploader("pp") for resource in metadata['sources']: print resource['url'] filename = filename_for_resource(resource) path = directory / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url print resource['url'] resource['name'] = resource['url'].split('/')[-1] u.close() metadata_file.truncate() metadata_file << json.dumps(metadata, indent=2) return
def add_metadata_to_qof_datasets(): u = Uploader("nshof") f = os.path.join(DATA_DIR, "nhsof_metadata_indicators.json") datasets = json.load(open(f)) for metadata in datasets: metadata['tags'] = ['QOF', 'Quality Outcomes Framework'] title = metadata['title'] #metadata['frequency'] = 'yearly' #metadata['title'] = 'QOF - National Quality Outcomes Framework - {0}-{1}'.format(match.group(1), match.group(2)) resources = [] for resource in metadata['sources']: resource['format'] = resource['filetype'] resource['name'] = resource['url'].split('/')[-1] resource['url_type'] = '' filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url resources.append(resource) metadata['resources'] = resources u.close() json.dump( datasets, open(os.path.join(DATA_DIR, "nhsof_metadata_indicators.json"), "w")) return
def add_metadata_to_ascof_datasets(): metadata_file = DATA_DIR/'dataset.metadata.json' metadata = metadata_file.json_load() metadata['tags'] = ['PHOF', 'Public Health Outcomes Framework'] metadata['title'] ='PHOF - Public Health Outcomes Framework' metadata['frequency'] = 'yearly' metadata['summary'] = PHOF_SUMMARY metadata['source'] = 'http://www.phoutcomes.info/public-health-outcomes-framework' metadata['coverage_start_date'] = '2000-01-01' metadata['coverage_end_date'] = '2013-12-31' u = Uploader("phof") for resource in metadata['resources']: filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url u.close() metadata_file.truncate() metadata_file << json.dumps(metadata, indent=2) return
def add_metadata_to_ascof_datasets(): for directory, metadata_file, metadata in datasets(): metadata['tags'] = ['ASCOF', 'Adult Social Care Outcomes Framework'] title = metadata['title'] match = re.search('(\d{4})-(\d{2})', title) begins = datetime.date(year=int(match.group(1)), month=4, day=1) ends = datetime.date(begins.year + 1, 3, 31) metadata['coverage_start_date'] = begins.isoformat() metadata['coverage_end_date'] = ends.isoformat() metadata['frequency'] = 'yearly' metadata['title'] = 'ASCOF - Adult Social Care Outcomes Framework, England -{0}-{1}'.format(match.group(1), match.group(2)) u = Uploader("ascof") for resource in metadata['resources']: print resource['url'] filename = filename_for_resource(resource) path = directory / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url print resource['url'] u.close() metadata_file.truncate() metadata_file << json.dumps(metadata, indent=2) return
def add_metadata_to_ascof_datasets(): for directory, metadata_file, metadata in datasets(): metadata['tags'] = ['ASCOF', 'Adult Social Care Outcomes Framework'] title = metadata['title'] match = re.search('(\d{4})-(\d{2})', title) begins = datetime.date(year=int(match.group(1)), month=4, day=1) ends = datetime.date(begins.year + 1, 3, 31) metadata['coverage_start_date'] = begins.isoformat() metadata['coverage_end_date'] = ends.isoformat() metadata['frequency'] = 'yearly' metadata[ 'title'] = 'ASCOF - Adult Social Care Outcomes Framework, England -{0}-{1}'.format( match.group(1), match.group(2)) u = Uploader("ascof") for resource in metadata['resources']: print resource['url'] filename = filename_for_resource(resource) path = directory / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url print resource['url'] u.close() metadata_file.truncate() metadata_file << json.dumps(metadata, indent=2) return
def main(workspace): DATA_DIR = ffs.Path(workspace) / 'data' DATA_DIR.mkdir() datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r')) u = Uploader("gp-survey") for dataset in datasets: print "Processing", dataset['name'] print "..fetching resources" for resource in dataset["resources"]: filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url u.close() json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb')) return 0
def add_metadata_to_qof_datasets(): u = Uploader("nshof") f = os.path.join(DATA_DIR, "nhsof_metadata_indicators.json") datasets = json.load(open(f)) for metadata in datasets: metadata['tags'] = ['QOF', 'Quality Outcomes Framework'] title = metadata['title'] #metadata['frequency'] = 'yearly' #metadata['title'] = 'QOF - National Quality Outcomes Framework - {0}-{1}'.format(match.group(1), match.group(2)) resources = [] for resource in metadata['sources']: resource['format'] = resource['filetype'] resource['name'] = resource['url'].split('/')[-1] resource['url_type'] = '' filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url resources.append(resource) metadata['resources'] = resources u.close() json.dump(datasets, open(os.path.join(DATA_DIR, "nhsof_metadata_indicators.json"), "w")) return
def add_metadata_to_datasets(): for directory, metadata_file, metadata in datasets(): metadata['tags'] = ['Mental Health'] u = Uploader("mhmds") for resource in metadata['resources']: print resource['url'] filename = filename_for_resource(resource) path = directory / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url print resource['url'] u.close() metadata_file.truncate() metadata_file << json.dumps(metadata, indent=2) return
def retrieve_qof_datasets(datasets): results = [] u = Uploader("qof") for dataset in datasets: print dataset['title'] for resource in dataset['resources']: filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url results.append(dataset) u.close() metadata_file = DATA_DIR / 'dataset.metadata.json' if metadata_file: metadata_file.truncate() metadata_file << json.dumps(results, indent=2)
def retrieve_qof_datasets(datasets): results = [] u = Uploader("qof") for dataset in datasets: print dataset['title'] for resource in dataset['resources']: filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url results.append(dataset) u.close() metadata_file = DATA_DIR/'dataset.metadata.json' if metadata_file: metadata_file.truncate() metadata_file << json.dumps(results, indent=2)
def main(workspace): DATA_DIR = ffs.Path(workspace) datasets = json.load(open(DATA_DIR / 'ccgois_indicators.json')) u = Uploader("ccgois") for dataset in datasets: resources = [] for resource in dataset['sources']: resource['format'] = resource['filetype'] resource['name'] = resource['url'].split('/')[-1] filename = filename_for_resource(resource) path = DATA_DIR / filename download_file(resource['url'], path) print "Uploading to S3" url = u.upload(path) resource['url'] = url resources.append(resource) dataset['resources'] = resources u.close() json.dump(datasets, open(DATA_DIR / 'ccgois_indicators.json', 'w'))
def main(workspace): DATA_DIR = ffs.Path(workspace) / 'data' DATA_DIR.mkdir() datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r')) tag_list = ["Statistics"] u = Uploader("stats") for dataset in datasets: print "Processing", dataset['name'] print "..adding tags" tags = dataset.get('tags', []) for t in tag_list: if not t in tags: tags.append(t) dataset['tags'] = tags print "..fetching resources" for resource in dataset["resources"]: filename = filename_for_resource(resource) path = DATA_DIR / filename try: download_file(resource['url'], path) except: continue print "Uploading to S3" url = u.upload(path) resource['url'] = url resource['url_type'] = '' # make sure we zap historical uploads u.close() json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb')) return 0
def main(workspace): global DATA_DIR DATA_DIR = ffs.Path(workspace) / 'data' org = dgu.action.organization_show(id=TARGET_ORGANISATION) if not _org_existsp(TARGET_ORGANISATION): catalogue.action.organization_create( name=org['name'], title=org['title'], description=org['description'], image_url=org['image_display_url'] ) print "Found {0} datasets on source".format(len(org['packages'])) for package in org['packages']: print 'uploading', package['title'].encode('utf8') dataset_dir = DATA_DIR/package['name'] # Get the dataset from DGU dataset = dgu.action.package_show(id=package['name']) del dataset['id'] # Set the new owning organisation dataset['owner_org'] = org['name'] u = Uploader("hqip") for resource in dataset['resources']: resource['name'] = resource['description'] if resource['format'] == "HTML": continue if resource['url'].startswith('hhttps'): resource['url'] = resource['url'].replace('hhttps', 'https') if 'cache_filepath' in resource: del resource['cache_filepath'] if 'tracking_summary' in resource: del resource['tracking_summary'] filename = filename_for_resource(resource) datafile = dataset_dir/filename print 'downloading', resource['url'], 'as', datafile try: download_file(resource['url'], datafile) print "Uploading to S3" url = u.upload(datafile) resource['url'] = url except: print '***' * 30 print "Failed to download: ", resource['url'] u.close() # Add a nice tag so we can find them all again dataset['tags'].append({'name': 'HQIP' }) print 'Owner org is', org['name'] try: extras = [] if 'temporal_coverage-from' in dataset: extras.append(dict(key='coverage_start_date', value=format_date(dataset['temporal_coverage-from']))) if 'temporal_coverage' in dataset: extras.append(dict(key='coverage_end_date', value=format_date(dataset['temporal_coverage-to']))) if 'frequency' in dataset: extras.append(dict(key='frequency', value=dataset['update_frequency'])) new_dataset = Dataset.create_or_update( name=dataset['name'], title=dataset['title'], state='active', visibility='private', license_id='uk-ogl', notes=dataset['notes'], origin=dataset['url'], tags=dataset['tags'], resources=dataset['resources'], owner_org=org['name'], extras=extras ) print "Created {}".format(dataset['name']) except ValueError as e: print 'skipping because error', e continue except ValidationError: raise print "Failed to upload {}".format(dataset['name'])
class Unzipper(object): def unzip(self, package): """ Processes each resource in the package, and possibly also adds more resources if we manage to extract some. """ extra_resources = [] self.uploader = Uploader("unzipped/{}".format(package['name'])) updated_resources = [] print " +", package['name'] for resource in package['resources']: if not resource['format'] == 'ZIP': updated_resources.append(resource) continue updated_resources.append(resource) print " +", resource['name'] extract_zip_to = self.unzip_file(resource['url']) print " + Processing files in ", extract_zip_to files = [] for (dirpath, dirnames, filenames) in os.walk(extract_zip_to): files.extend([os.path.join(dirpath, p) for p in filenames]) for f in files: res = self.local_file_to_resource(f, resource) if res: updated_resources.append(res) shutil.rmtree(extract_zip_to) package['resources'] = updated_resources self.uploader.close() if extra_resources: package['resources'].extend(extra_resources) def local_file_to_resource(self, local_file, parent_resource): print "Adding {} from {}".format(local_file, parent_resource['name']) if local_file.lower().endswith('.zip'): return None filename = local_file.split('/')[-1] parent_desc = parent_resource['description'] if parent_desc == parent_resource['name']: parent_desc = "" description = u"(Extracted from {}) {}".format(parent_resource['name'], parent_desc) resource = { "description": description.replace('\u00a0', ' '), "name": filename, "format": filename.split('.')[-1].upper(), } hash_file = create_hash_file(local_file) try: url = self.uploader.upload(local_file) resource['url'] = url except Exception, e: print e return None os.unlink(hash_file) return resource
def main(workspace): global DATA_DIR DATA_DIR = ffs.Path(workspace) / 'data' org = dgu.action.organization_show(id=TARGET_ORGANISATION) if not _org_existsp(TARGET_ORGANISATION): catalogue.action.organization_create( name=org['name'], title=org['title'], description=org['description'], image_url=org['image_display_url']) print "Found {0} datasets on source".format(len(org['packages'])) for package in org['packages']: print 'uploading', package['title'].encode('utf8') dataset_dir = DATA_DIR / package['name'] # Get the dataset from DGU dataset = dgu.action.package_show(id=package['name']) del dataset['id'] # Set the new owning organisation dataset['owner_org'] = org['name'] u = Uploader("hqip") for resource in dataset['resources']: resource['name'] = resource['description'] if resource['format'] == "HTML": continue if resource['url'].startswith('hhttps'): resource['url'] = resource['url'].replace('hhttps', 'https') if 'cache_filepath' in resource: del resource['cache_filepath'] if 'tracking_summary' in resource: del resource['tracking_summary'] filename = filename_for_resource(resource) datafile = dataset_dir / filename print 'downloading', resource['url'], 'as', datafile try: download_file(resource['url'], datafile) print "Uploading to S3" url = u.upload(datafile) resource['url'] = url except: print '***' * 30 print "Failed to download: ", resource['url'] u.close() # Add a nice tag so we can find them all again dataset['tags'].append({'name': 'HQIP'}) print 'Owner org is', org['name'] try: extras = [] if 'temporal_coverage-from' in dataset: extras.append( dict(key='coverage_start_date', value=format_date(dataset['temporal_coverage-from']))) if 'temporal_coverage' in dataset: extras.append( dict(key='coverage_end_date', value=format_date(dataset['temporal_coverage-to']))) if 'frequency' in dataset: extras.append( dict(key='frequency', value=dataset['update_frequency'])) new_dataset = Dataset.create_or_update( name=dataset['name'], title=dataset['title'], state='active', visibility='private', license_id='uk-ogl', notes=dataset['notes'], origin=dataset['url'], tags=dataset['tags'], resources=dataset['resources'], owner_org=org['name'], extras=extras) print "Created {}".format(dataset['name']) except ValueError as e: print 'skipping because error', e continue except ValidationError: raise print "Failed to upload {}".format(dataset['name'])