Example #1
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    metadata_file = DATA_DIR / 'dataset.metadata.json'
    datasets = json.load(open(metadata_file, 'r'))

    u = Uploader("ods")
    unzipper = Unzipper()
    for dataset in datasets:
        has_zip = False

        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url

            if resource['format'].upper() == 'ZIP':
                has_zip = True

        if has_zip:
            print "Processing ZIP files in dataset"
            print '*' * 30
            unzipper.unzip(dataset)
            print '*' * 30
    u.close()

    json.dump(datasets, open(metadata_file, 'w'))
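Every example in this listing leans on the same module-level helpers, which the snippets themselves never show. The sketch below is a plausible reading only: the names filename_for_resource and download_file come from the examples, but the bodies, the use of requests, and the URL-derived filename rule are assumptions.

# Sketch only: hypothetical implementations of the shared download helpers.
import urllib

import requests


def filename_for_resource(resource):
    # Assumption: use the last path segment of the resource URL as the
    # local filename, with percent-encoding undone.
    name = resource['url'].split('/')[-1].split('?')[0] or 'resource'
    return urllib.unquote(name)


def download_file(url, path):
    # Assumption: stream the remote file to `path` on disk and return it.
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(str(path), 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)
    return path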
Example #2
def add_metadata_to_pp_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ["GP", "Population"]
        title = metadata['title']
        #begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        #ends = datetime.date(begins.year + 1, 3, 31)
        #metadata['coverage_start_date'] = begins.isoformat()
        #metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'Quarterly'

        print metadata['title']
        u = Uploader("pp")

        for resource in metadata['sources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
            resource['name'] = resource['url'].split('/')[-1]
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
Example #3
def add_metadata_to_qof_datasets():
    u = Uploader("nshof")

    f = os.path.join(DATA_DIR, "nhsof_metadata_indicators.json")
    datasets = json.load(open(f))

    for metadata in datasets:
        metadata['tags'] = ['QOF', 'Quality Outcomes Framework']
        title = metadata['title']
        #metadata['frequency'] = 'yearly'
        #metadata['title'] = 'QOF - National Quality Outcomes Framework - {0}-{1}'.format(match.group(1), match.group(2))

        resources = []
        for resource in metadata['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]
            resource['url_type'] = ''

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resources.append(resource)
        metadata['resources'] = resources

    u.close()

    json.dump(
        datasets,
        open(os.path.join(DATA_DIR, "nhsof_metadata_indicators.json"), "w"))
    return
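The Uploader that each function constructs (and closes) is also not part of the examples. The sketch below is hypothetical: boto3, the bucket name and the returned URL format are all assumptions, kept only to show the prefix/upload/close shape the examples rely on.

# Sketch only: a hypothetical Uploader pushing files to S3 under a prefix.
import os

import boto3


class Uploader(object):
    def __init__(self, prefix, bucket='example-open-data'):
        # 'example-open-data' is a placeholder bucket name.
        self.prefix = prefix
        self.bucket = bucket
        self.client = boto3.client('s3')

    def upload(self, path):
        key = '{0}/{1}'.format(self.prefix, os.path.basename(str(path)))
        self.client.upload_file(str(path), self.bucket, key)
        return 'https://{0}.s3.amazonaws.com/{1}'.format(self.bucket, key)

    def close(self):
        # boto3 clients need no explicit teardown; kept for API parity.
        pass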
Example #4
    def unzip(self, package):
        """ Processes each resource in the package, and possibly also
            adds more resources if we manage to extract some. """
        extra_resources = []

        self.uploader = Uploader("unzipped/{}".format(package['name']))

        updated_resources = []
        print " +", package['name']
        for resource in package['resources']:
            if not resource['format'] == 'ZIP':
                updated_resources.append(resource)
                continue

            updated_resources.append(resource)
            print "   +", resource['name']
            extract_zip_to = self.unzip_file(resource['url'])

            print "        + Processing files in ", extract_zip_to

            files = []
            for (dirpath, dirnames, filenames) in os.walk(extract_zip_to):
                files.extend([os.path.join(dirpath, p) for p in filenames])

            for f in files:
                res = self.local_file_to_resource(f, resource)
                if res:
                    updated_resources.append(res)

            shutil.rmtree(extract_zip_to)
        package['resources'] = updated_resources

        self.uploader.close()
        if extra_resources:
            package['resources'].extend(extra_resources)
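The unzip() method above delegates to self.unzip_file(resource['url']), which the example does not include. A plausible standalone sketch of that step, assuming the archive is fetched with requests and extracted into a fresh temporary directory:

# Sketch only: stand-in for Unzipper.unzip_file.
import os
import tempfile
import zipfile

import requests


def unzip_file(url):
    # Download the ZIP at url, extract it next to itself and return the
    # directory so the caller can os.walk() it.
    extract_to = tempfile.mkdtemp()
    archive_path = os.path.join(extract_to, 'archive.zip')

    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(archive_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall(extract_to)
    os.unlink(archive_path)  # don't re-process the archive itself
    return extract_to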
Example #5
def add_metadata_to_pp_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ["GP", "Population"]
        title = metadata['title']
        #begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        #ends = datetime.date(begins.year + 1, 3, 31)
        #metadata['coverage_start_date'] = begins.isoformat()
        #metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'Quarterly'

        print metadata['title']
        u = Uploader("pp")

        for resource in metadata['sources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
            resource['name'] = resource['url'].split('/')[-1]
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
Example #6
def add_metadata_to_ascof_datasets():
    metadata_file = DATA_DIR/'dataset.metadata.json'
    metadata = metadata_file.json_load()

    metadata['tags'] = ['PHOF', 'Public Health Outcomes Framework']
    metadata['title'] ='PHOF - Public Health Outcomes Framework'
    metadata['frequency'] = 'yearly'
    metadata['summary'] = PHOF_SUMMARY
    metadata['source'] = 'http://www.phoutcomes.info/public-health-outcomes-framework'

    metadata['coverage_start_date'] = '2000-01-01'
    metadata['coverage_end_date'] = '2013-12-31'

    u = Uploader("phof")
    for resource in metadata['resources']:
        filename = filename_for_resource(resource)
        path = DATA_DIR / filename

        download_file(resource['url'], path)
        print "Uploading to S3"
        url = u.upload(path)
        resource['url'] = url
    u.close()


    metadata_file.truncate()
    metadata_file << json.dumps(metadata, indent=2)
    return
Example #7
def add_metadata_to_ascof_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['ASCOF', 'Adult Social Care Outcomes Framework']
        title = metadata['title']
        match = re.search(r'(\d{4})-(\d{2})', title)
        begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        ends = datetime.date(begins.year + 1, 3, 31)
        metadata['coverage_start_date'] = begins.isoformat()
        metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'yearly'
        metadata['title'] = 'ASCOF - Adult Social Care Outcomes Framework, England -{0}-{1}'.format(match.group(1), match.group(2))

        u = Uploader("ascof")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
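For reference, the financial-year arithmetic above behaves as follows on an illustrative title (the title string here is made up, not taken from the data):

import datetime
import re

title = 'Adult Social Care Outcomes Framework, England - 2013-14'
match = re.search(r'(\d{4})-(\d{2})', title)
begins = datetime.date(year=int(match.group(1)), month=4, day=1)  # 2013-04-01
ends = datetime.date(begins.year + 1, 3, 31)                      # 2014-03-31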
Example #8
def add_metadata_to_ascof_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['ASCOF', 'Adult Social Care Outcomes Framework']
        title = metadata['title']
        match = re.search(r'(\d{4})-(\d{2})', title)
        begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        ends = datetime.date(begins.year + 1, 3, 31)
        metadata['coverage_start_date'] = begins.isoformat()
        metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'yearly'
        metadata[
            'title'] = 'ASCOF - Adult Social Care Outcomes Framework, England -{0}-{1}'.format(
                match.group(1), match.group(2))

        u = Uploader("ascof")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
Example #9
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    u = Uploader("gp-survey")

    for dataset in datasets:
        print "Processing", dataset['name']

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url

    u.close()
    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))

    return 0
Example #10
def add_metadata_to_qof_datasets():
    u = Uploader("nshof")

    f = os.path.join(DATA_DIR, "nhsof_metadata_indicators.json")
    datasets = json.load(open(f))

    for metadata in datasets:
        metadata['tags'] = ['QOF', 'Quality Outcomes Framework']
        title = metadata['title']
        #metadata['frequency'] = 'yearly'
        #metadata['title'] = 'QOF - National Quality Outcomes Framework - {0}-{1}'.format(match.group(1), match.group(2))

        resources = []
        for resource in metadata['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]
            resource['url_type'] = ''

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resources.append(resource)
        metadata['resources'] = resources


    u.close()

    json.dump(datasets, open(os.path.join(DATA_DIR, "nhsof_metadata_indicators.json"), "w"))
    return
Example #11
def add_metadata_to_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['Mental Health']

        u = Uploader("mhmds")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
Example #12
def add_metadata_to_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['Mental Health']

        u = Uploader("mhmds")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return
Example #13
def retrieve_qof_datasets(datasets):
    results = []

    u = Uploader("qof")
    for dataset in datasets:
        print dataset['title']
        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
        results.append(dataset)

    u.close()

    metadata_file = DATA_DIR / 'dataset.metadata.json'
    if metadata_file:
        metadata_file.truncate()
    metadata_file << json.dumps(results, indent=2)
Example #14
def retrieve_qof_datasets(datasets):
    results = []

    u = Uploader("qof")
    for dataset in datasets:
        print dataset['title']
        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
        results.append(dataset)

    u.close()

    metadata_file = DATA_DIR/'dataset.metadata.json'
    if metadata_file:
        metadata_file.truncate()
    metadata_file << json.dumps(results, indent=2)
Example #15
def main(workspace):
    DATA_DIR = ffs.Path(workspace)
    datasets = json.load(open(DATA_DIR / 'ccgois_indicators.json'))

    u = Uploader("ccgois")
    for dataset in datasets:
        resources = []
        for resource in dataset['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resources.append(resource)
        dataset['resources'] = resources
    u.close()

    json.dump(datasets, open(DATA_DIR / 'ccgois_indicators.json', 'w'))
Example #16
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    tag_list = ["Statistics"]
    u = Uploader("stats")

    for dataset in datasets:
        print "Processing", dataset['name']

        print "..adding tags"
        tags = dataset.get('tags', [])
        for t in tag_list:
            if not t in tags:
                tags.append(t)
        dataset['tags'] = tags

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            try:
                download_file(resource['url'], path)
            except:
                continue
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resource['url_type'] = ''  # make sure we zap historical uploads

    u.close()
    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))

    return 0
Example #17
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    tag_list = ["Statistics"]
    u = Uploader("stats")

    for dataset in datasets:
        print "Processing", dataset['name']

        print "..adding tags"
        tags = dataset.get('tags', [])
        for t in tag_list:
            if not t in tags:
                tags.append(t)
        dataset['tags'] = tags

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            try:
                download_file(resource['url'], path)
            except:
                continue
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resource['url_type'] = ''  # make sure we zap historical uploads

    u.close()
    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))

    return 0
Example #18
class Unzipper(object):
    def unzip(self, package):
        """ Processes each resource in the package, and possibly also
            adds more resources if we manage to extract some. """
        extra_resources = []

        self.uploader = Uploader("unzipped/{}".format(package['name']))

        updated_resources = []
        print " +", package['name']
        for resource in package['resources']:
            if not resource['format'] == 'ZIP':
                updated_resources.append(resource)
                continue

            updated_resources.append(resource)
            print "   +", resource['name']
            extract_zip_to = self.unzip_file(resource['url'])

            print "        + Processing files in ", extract_zip_to

            files = []
            for (dirpath, dirnames, filenames) in os.walk(extract_zip_to):
                files.extend([os.path.join(dirpath, p) for p in filenames])

            for f in files:
                res = self.local_file_to_resource(f, resource)
                if res:
                    updated_resources.append(res)

            shutil.rmtree(extract_zip_to)
        package['resources'] = updated_resources

        self.uploader.close()
        if extra_resources:
            package['resources'].extend(extra_resources)

    def local_file_to_resource(self, local_file, parent_resource):
        print "Adding {} from {}".format(local_file, parent_resource['name'])
        if local_file.lower().endswith('.zip'):
            return None

        filename = local_file.split('/')[-1]
        parent_desc = parent_resource['description']
        if parent_desc == parent_resource['name']:
            parent_desc = ""
        description = u"(Extracted from {}) {}".format(parent_resource['name'],
                                                       parent_desc)
        resource = {
            "description": description.replace('\u00a0', ' '),
            "name": filename,
            "format": filename.split('.')[-1].upper(),
        }

        hash_file = create_hash_file(local_file)
        try:
            url = self.uploader.upload(local_file)
            resource['url'] = url
        except Exception, e:
            print e
            return None

        os.unlink(hash_file)

        return resource
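local_file_to_resource calls create_hash_file before uploading, and that helper is not shown either. A hypothetical sketch, assuming it writes an MD5 digest to a sidecar file and returns the sidecar's path (which local_file_to_resource later unlinks):

# Sketch only: hypothetical create_hash_file helper.
import hashlib


def create_hash_file(local_file):
    md5 = hashlib.md5()
    with open(local_file, 'rb') as f:
        for chunk in iter(lambda: f.read(8192), b''):
            md5.update(chunk)
    hash_path = local_file + '.md5'
    with open(hash_path, 'w') as f:
        f.write(md5.hexdigest())
    return hash_path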
Example #19
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'

    org = dgu.action.organization_show(id=TARGET_ORGANISATION)

    if not _org_existsp(TARGET_ORGANISATION):
        catalogue.action.organization_create(
            name=org['name'],
            title=org['title'],
            description=org['description'],
            image_url=org['image_display_url']
        )

    print "Found {0} datasets on source".format(len(org['packages']))

    for package in org['packages']:
        print 'uploading', package['title'].encode('utf8')
        dataset_dir = DATA_DIR/package['name']
        # Get the dataset from DGU
        dataset = dgu.action.package_show(id=package['name'])
        del dataset['id']

        # Set the new owning organisation
        dataset['owner_org'] = org['name']

        u = Uploader("hqip")
        for resource in dataset['resources']:
            resource['name'] = resource['description']
            if resource['format'] == "HTML":
                continue
            if resource['url'].startswith('hhttps'):
                resource['url'] = resource['url'].replace('hhttps', 'https')

            if 'cache_filepath' in resource:
                del resource['cache_filepath']
            if 'tracking_summary' in resource:
                del resource['tracking_summary']

            filename = filename_for_resource(resource)

            datafile = dataset_dir/filename
            print 'downloading', resource['url'], 'as', datafile

            try:
                download_file(resource['url'], datafile)
                print "Uploading to S3"
                url = u.upload(datafile)
                resource['url'] = url
            except:
                print '***' * 30
                print "Failed to download: ", resource['url']
        u.close()

        # Add a nice tag so we can find them all again
        dataset['tags'].append({'name': 'HQIP' })
        print 'Owner org is', org['name']
        try:
            extras = []
            if 'temporal_coverage-from' in dataset:
                extras.append(dict(key='coverage_start_date', value=format_date(dataset['temporal_coverage-from'])))
            if 'temporal_coverage-to' in dataset:
                extras.append(dict(key='coverage_end_date', value=format_date(dataset['temporal_coverage-to'])))
            if 'frequency' in dataset:
                extras.append(dict(key='frequency', value=dataset['update_frequency']))

            new_dataset = Dataset.create_or_update(
                name=dataset['name'],
                title=dataset['title'],
                state='active',
                visibility='private',
                license_id='uk-ogl',
                notes=dataset['notes'],
                origin=dataset['url'],
                tags=dataset['tags'],
                resources=dataset['resources'],
                owner_org=org['name'],
                extras=extras
            )
            print "Created {}".format(dataset['name'])
        except ValueError as e:
            print 'skipping because error', e
            continue
        except ValidationError:
            raise
            print "Failed to upload {}".format(dataset['name'])
Example #20
def publish_indicators(start_from=0):
    global DATA_DIR
    u = Uploader("hscic-indicators")

    indicatorfile = ffs.Path(get_resource_path('indicators.json'))
    logging.info('Loading {}'.format(indicatorfile))
    indicators = indicatorfile.json_load()
    logging.info('Processing {} indicators'.format(len(indicators)))
    logging.info('Starting from record {}'.format(start_from))
    for indicator in indicators[start_from:]:
        try:
            resources = []
            for s in indicator['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'].upper(),
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                print "Uploading to S3"
                url = u.upload(path)
                resource['url'] = url
                """
                resources.append(resource)

            if not 'indicators' in indicator['keyword(s)']:
                indicator['keyword(s)'].append('indicators')

            title = indicator['title']

            c = Curator(indicator)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)

            tags = []
            if 'keyword(s)' in indicator:
                indicator['keyword(s)'] = sum([
                    clean_tag(k)
                    for k in indicator.get('keyword(s)', []) if len(k) > 2
                ], [])
                tags = dc.tags(*indicator['keyword(s)'])

            print '+ Create/Update dataset {}'.format(indicator['title'])
            dc.Dataset.create_or_update(
                name=slugify.slugify(title).lower()[:99],
                title=title,
                state='active',
                licence_id='ogl',
                notes=to_markdown(indicator['definition'].encode('utf8')),
                url='https://indicators.ic.nhs.uk/webview/',
                tags=tags,
                resources=resources,
                owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(
                        id=slugify.slugify(title)[:99].lower())
                except:
                    continue

                for group in groups:
                    group = group.lower()

                    if [
                            g for g in dataset.get('groups', [])
                            if g['name'] == group
                    ]:
                        print 'Already in group', group
                    else:
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')

        except Exception as ex:
            import traceback
            traceback.print_exc()
            import sys
            sys.exit(1)

    u.close()
    return
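Both publishing functions read their input through get_resource_path, which is not shown. A minimal sketch, assuming the JSON files are bundled in a resources/ directory next to the module (the directory layout is an assumption):

import os


def get_resource_path(filename):
    # Sketch only: resolve a data file shipped alongside this module.
    here = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(here, 'resources', filename)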
Example #21
def publish_datasets(start_from=0):
    global DATA_DIR

    u = Uploader("hscic-datasets")

    datasetfile = ffs.Path(get_resource_path('datasets.json'))
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} indicators'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))

    import random
    total = len(datasets) - start_from
    current = 1

    for dataset in datasets[start_from:]:
        print "STATUS: {}/{}".format(current, total)
        current += 1

        #print u'Processing {}'.format(dataset['title'])
        #print '  ID: {}'.format(dataset['id'])
        try:
            resources = []
            for s in dataset['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'],
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                resource['url'] = u.upload(path)
                """
                resources.append(resource)

            if not resources:
                print "Dataset {} does not have any resources".format(
                    dataset['id'])
                continue

            title = dataset['title']

            c = Curator(dataset)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)
            name = slugify.slugify(title).lower()[0:99]

            # Call cleantags on each work and expect back a list, which is then flattened

            tags = []
            if 'keywords' in dataset:
                dataset['keywords'] = sum([
                    clean_tag(k)
                    for k in dataset.get('keywords', []) if len(k) > 2
                ], [])
                tags = dc.tags(*dataset['keywords'])

            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\n<h2>KEY FACTS:</h2>\n' + ''.join(
                    dataset['key_facts'])
            notes = to_markdown(notes)

            name = 'hscic_dataset_{}'.format(dataset['id'])

            dc.Dataset.create_or_update(name=name,
                                        title=title,
                                        state='active',
                                        licence_id='ogl',
                                        notes=notes,
                                        url=dataset['source'],
                                        tags=tags,
                                        resources=resources,
                                        owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(id=name)
                except:
                    continue

                for group in groups:
                    group = group.lower()

                    if [
                            g for g in dataset.get('groups', [])
                            if g['name'] == group
                    ]:
                        print 'Already in group', group
                    else:
                        dc.ensure_group(group)
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception as ex:
            import traceback
            traceback.print_exc()

    u.close()
    return
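clean_tag, used when flattening keywords into CKAN tags in both publish functions, is another assumed helper. A hypothetical sketch that returns a list so the callers' sum(..., []) flattening works; the exact cleaning rules are guesses:

import re


def clean_tag(keyword):
    # Sketch only: keep characters CKAN tags accept, drop very short tags,
    # and return a (possibly empty) list for the callers to flatten.
    cleaned = re.sub(r'[^\w\s._-]', '', keyword).strip()
    if len(cleaned) < 3:
        return []
    return [cleaned]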
Example #22
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'

    org = dgu.action.organization_show(id=TARGET_ORGANISATION)

    if not _org_existsp(TARGET_ORGANISATION):
        catalogue.action.organization_create(
            name=org['name'],
            title=org['title'],
            description=org['description'],
            image_url=org['image_display_url'])

    print "Found {0} datasets on source".format(len(org['packages']))

    for package in org['packages']:
        print 'uploading', package['title'].encode('utf8')
        dataset_dir = DATA_DIR / package['name']
        # Get the dataset from DGU
        dataset = dgu.action.package_show(id=package['name'])
        del dataset['id']

        # Set the new owning organisation
        dataset['owner_org'] = org['name']

        u = Uploader("hqip")
        for resource in dataset['resources']:
            resource['name'] = resource['description']
            if resource['format'] == "HTML":
                continue
            if resource['url'].startswith('hhttps'):
                resource['url'] = resource['url'].replace('hhttps', 'https')

            if 'cache_filepath' in resource:
                del resource['cache_filepath']
            if 'tracking_summary' in resource:
                del resource['tracking_summary']

            filename = filename_for_resource(resource)

            datafile = dataset_dir / filename
            print 'downloading', resource['url'], 'as', datafile

            try:
                download_file(resource['url'], datafile)
                print "Uploading to S3"
                url = u.upload(datafile)
                resource['url'] = url
            except:
                print '***' * 30
                print "Failed to download: ", resource['url']
        u.close()

        # Add a nice tag so we can find them all again
        dataset['tags'].append({'name': 'HQIP'})
        print 'Owner org is', org['name']
        try:
            extras = []
            if 'temporal_coverage-from' in dataset:
                extras.append(
                    dict(key='coverage_start_date',
                         value=format_date(dataset['temporal_coverage-from'])))
            if 'temporal_coverage-to' in dataset:
                extras.append(
                    dict(key='coverage_end_date',
                         value=format_date(dataset['temporal_coverage-to'])))
            if 'frequency' in dataset:
                extras.append(
                    dict(key='frequency', value=dataset['update_frequency']))

            new_dataset = Dataset.create_or_update(
                name=dataset['name'],
                title=dataset['title'],
                state='active',
                visibility='private',
                license_id='uk-ogl',
                notes=dataset['notes'],
                origin=dataset['url'],
                tags=dataset['tags'],
                resources=dataset['resources'],
                owner_org=org['name'],
                extras=extras)
            print "Created {}".format(dataset['name'])
        except ValueError as e:
            print 'skipping because error', e
            continue
        except ValidationError:
            raise
            print "Failed to upload {}".format(dataset['name'])