Python Uploader.uploadの例、publish.lib.upload.Uploader.upload Pythonの例

コード例 #1

0

ファイルを表示

ファイル: transform.py プロジェクト: nhsengland/publish-o-matic

def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    metadata_file = DATA_DIR / 'dataset.metadata.json'
    datasets = json.load(open(metadata_file, 'r'))

    u = Uploader("ods")
    unzipper = Unzipper()
    for dataset in datasets:
        has_zip = False

        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url

            if resource['format'].upper() == 'ZIP':
                has_zip = True

        if has_zip:
            print "Processing ZIP files in dataset"
            print '*' * 30
            unzipper.unzip(dataset)
            print '*' * 30
    u.close()

    json.dump(datasets, open(metadata_file, 'w'))

コード例 #2

0

ファイルを表示

ファイル: transform.py プロジェクト: uk-gov-mirror/nhsengland.publish-o-matic

def add_metadata_to_pp_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ["GP", "Population"]
        title = metadata['title']
        #begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        #ends = datetime.date(begins.year + 1, 3, 31)
        #metadata['coverage_start_date'] = begins.isoformat()
        #metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'Quarterly'

        print metadata['title']
        u = Uploader("pp")

        for resource in metadata['sources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
            resource['name'] = resource['url'].split('/')[-1]
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

コード例 #3

0

ファイルを表示

ファイル: transform.py プロジェクト: uk-gov-mirror/nhsengland.publish-o-matic

def add_metadata_to_qof_datasets():
    u = Uploader("nshof")

    f = os.path.join(DATA_DIR, "nhsof_metadata_indicators.json")
    datasets = json.load(open(f))

    for metadata in datasets:
        metadata['tags'] = ['QOF', 'Quality Outcomes Framework']
        title = metadata['title']
        #metadata['frequency'] = 'yearly'
        #metadata['title'] = 'QOF - National Quality Outcomes Framework - {0}-{1}'.format(match.group(1), match.group(2))

        resources = []
        for resource in metadata['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]
            resource['url_type'] = ''

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resources.append(resource)
        metadata['resources'] = resources

    u.close()

    json.dump(
        datasets,
        open(os.path.join(DATA_DIR, "nhsof_metadata_indicators.json"), "w"))
    return

コード例 #4

0

ファイルを表示

ファイル: transform.py プロジェクト: nhsengland/publish-o-matic

def add_metadata_to_pp_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ["GP", "Population"]
        title = metadata['title']
        #begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        #ends = datetime.date(begins.year + 1, 3, 31)
        #metadata['coverage_start_date'] = begins.isoformat()
        #metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'Quarterly'

        print metadata['title']
        u = Uploader("pp")

        for resource in metadata['sources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
            resource['name'] = resource['url'].split('/')[-1]
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

コード例 #5

0

ファイルを表示

ファイル: transform.py プロジェクト: nhsengland/publish-o-matic

def add_metadata_to_ascof_datasets():
    metadata_file = DATA_DIR/'dataset.metadata.json'
    metadata = metadata_file.json_load()

    metadata['tags'] = ['PHOF', 'Public Health Outcomes Framework']
    metadata['title'] ='PHOF - Public Health Outcomes Framework'
    metadata['frequency'] = 'yearly'
    metadata['summary'] = PHOF_SUMMARY
    metadata['source'] = 'http://www.phoutcomes.info/public-health-outcomes-framework'

    metadata['coverage_start_date'] = '2000-01-01'
    metadata['coverage_end_date'] = '2013-12-31'

    u = Uploader("phof")
    for resource in metadata['resources']:
        filename = filename_for_resource(resource)
        path = DATA_DIR / filename

        download_file(resource['url'], path)
        print "Uploading to S3"
        url = u.upload(path)
        resource['url'] = url
    u.close()


    metadata_file.truncate()
    metadata_file << json.dumps(metadata, indent=2)
    return

コード例 #6

0

ファイルを表示

ファイル: transform.py プロジェクト: nhsengland/publish-o-matic

def add_metadata_to_ascof_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['ASCOF', 'Adult Social Care Outcomes Framework']
        title = metadata['title']
        match = re.search('(\d{4})-(\d{2})', title)
        begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        ends = datetime.date(begins.year + 1, 3, 31)
        metadata['coverage_start_date'] = begins.isoformat()
        metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'yearly'
        metadata['title'] = 'ASCOF - Adult Social Care Outcomes Framework, England -{0}-{1}'.format(match.group(1), match.group(2))

        u = Uploader("ascof")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

コード例 #7

0

ファイルを表示

def add_metadata_to_ascof_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['ASCOF', 'Adult Social Care Outcomes Framework']
        title = metadata['title']
        match = re.search('(\d{4})-(\d{2})', title)
        begins = datetime.date(year=int(match.group(1)), month=4, day=1)
        ends = datetime.date(begins.year + 1, 3, 31)
        metadata['coverage_start_date'] = begins.isoformat()
        metadata['coverage_end_date'] = ends.isoformat()
        metadata['frequency'] = 'yearly'
        metadata[
            'title'] = 'ASCOF - Adult Social Care Outcomes Framework, England -{0}-{1}'.format(
                match.group(1), match.group(2))

        u = Uploader("ascof")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

コード例 #8

0

ファイルを表示

ファイル: transform.py プロジェクト: uk-gov-mirror/nhsengland.publish-o-matic

def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    u = Uploader("gp-survey")

    for dataset in datasets:
        print "Processing", dataset['name']

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url

    u.close()
    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))

    return 0

コード例 #9

0

ファイルを表示

ファイル: transform.py プロジェクト: nhsengland/publish-o-matic

def add_metadata_to_qof_datasets():
    u = Uploader("nshof")

    f = os.path.join(DATA_DIR, "nhsof_metadata_indicators.json")
    datasets = json.load(open(f))

    for metadata in datasets:
        metadata['tags'] = ['QOF', 'Quality Outcomes Framework']
        title = metadata['title']
        #metadata['frequency'] = 'yearly'
        #metadata['title'] = 'QOF - National Quality Outcomes Framework - {0}-{1}'.format(match.group(1), match.group(2))

        resources = []
        for resource in metadata['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]
            resource['url_type'] = ''

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resources.append(resource)
        metadata['resources'] = resources


    u.close()

    json.dump(datasets, open(os.path.join(DATA_DIR, "nhsof_metadata_indicators.json"), "w"))
    return

コード例 #10

0

ファイルを表示

ファイル: transform.py プロジェクト: nhsengland/publish-o-matic

def add_metadata_to_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['Mental Health']

        u = Uploader("mhmds")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

コード例 #11

0

ファイルを表示

ファイル: transform.py プロジェクト: uk-gov-mirror/nhsengland.publish-o-matic

def add_metadata_to_datasets():
    for directory, metadata_file, metadata in datasets():
        metadata['tags'] = ['Mental Health']

        u = Uploader("mhmds")
        for resource in metadata['resources']:
            print resource['url']
            filename = filename_for_resource(resource)
            path = directory / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            print resource['url']
        u.close()

        metadata_file.truncate()
        metadata_file << json.dumps(metadata, indent=2)
    return

コード例 #12

0

ファイルを表示

def retrieve_qof_datasets(datasets):
    results = []

    u = Uploader("qof")
    for dataset in datasets:
        print dataset['title']
        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
        results.append(dataset)

    u.close()

    metadata_file = DATA_DIR / 'dataset.metadata.json'
    if metadata_file:
        metadata_file.truncate()
    metadata_file << json.dumps(results, indent=2)

コード例 #13

0

ファイルを表示

ファイル: scrape.py プロジェクト: nhsengland/publish-o-matic

def retrieve_qof_datasets(datasets):
    results = []

    u = Uploader("qof")
    for dataset in datasets:
        print dataset['title']
        for resource in dataset['resources']:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
        results.append(dataset)

    u.close()

    metadata_file = DATA_DIR/'dataset.metadata.json'
    if metadata_file:
        metadata_file.truncate()
    metadata_file << json.dumps(results, indent=2)

コード例 #14

0

ファイルを表示

ファイル: transform.py プロジェクト: uk-gov-mirror/nhsengland.publish-o-matic

def main(workspace):
    DATA_DIR = ffs.Path(workspace)
    datasets = json.load(open(DATA_DIR / 'ccgois_indicators.json'))

    u = Uploader("ccgois")
    for dataset in datasets:
        resources = []
        for resource in dataset['sources']:
            resource['format'] = resource['filetype']
            resource['name'] = resource['url'].split('/')[-1]

            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            download_file(resource['url'], path)
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resources.append(resource)
        dataset['resources'] = resources
    u.close()

    json.dump(datasets, open(DATA_DIR / 'ccgois_indicators.json', 'w'))

コード例 #15

0

ファイルを表示

def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    tag_list = ["Statistics"]
    u = Uploader("stats")

    for dataset in datasets:
        print "Processing", dataset['name']

        print "..adding tags"
        tags = dataset.get('tags', [])
        for t in tag_list:
            if not t in tags:
                tags.append(t)
        dataset['tags'] = tags

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            try:
                download_file(resource['url'], path)
            except:
                continue
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resource['url_type'] = ''  # make sure we zap historical uploads

    u.close()
    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))

    return 0

コード例 #16

0

ファイルを表示

ファイル: transform.py プロジェクト: nhsengland/publish-o-matic

def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), 'r'))

    tag_list = ["Statistics"]
    u = Uploader("stats")

    for dataset in datasets:
        print "Processing", dataset['name']

        print "..adding tags"
        tags = dataset.get('tags', [])
        for t in tag_list:
            if not t in tags:
                tags.append(t)
        dataset['tags'] = tags

        print "..fetching resources"
        for resource in dataset["resources"]:
            filename = filename_for_resource(resource)
            path = DATA_DIR / filename

            try:
                download_file(resource['url'], path)
            except:
                continue
            print "Uploading to S3"
            url = u.upload(path)
            resource['url'] = url
            resource['url_type'] = ''  # make sure we zap historical uploads

    u.close()
    json.dump(datasets, open(os.path.join(DATA_DIR, "metadata.json"), 'wb'))

    return 0

コード例 #17

0

ファイルを表示

ファイル: load.py プロジェクト: nhsengland/publish-o-matic

def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'

    org = dgu.action.organization_show(id=TARGET_ORGANISATION)

    if not _org_existsp(TARGET_ORGANISATION):
        catalogue.action.organization_create(
            name=org['name'],
            title=org['title'],
            description=org['description'],
            image_url=org['image_display_url']
        )

    print "Found {0} datasets on source".format(len(org['packages']))

    for package in org['packages']:
        print 'uploading', package['title'].encode('utf8')
        dataset_dir = DATA_DIR/package['name']
        # Get the dataset from DGU
        dataset = dgu.action.package_show(id=package['name'])
        del dataset['id']

        # Set the new owning organisation
        dataset['owner_org'] = org['name']

        u = Uploader("hqip")
        for resource in dataset['resources']:
            resource['name'] = resource['description']
            if resource['format'] == "HTML":
                continue
            if resource['url'].startswith('hhttps'):
                resource['url'] = resource['url'].replace('hhttps', 'https')

            if 'cache_filepath' in resource:
                del resource['cache_filepath']
            if 'tracking_summary' in resource:
                del resource['tracking_summary']

            filename = filename_for_resource(resource)

            datafile = dataset_dir/filename
            print 'downloading', resource['url'], 'as', datafile

            try:
                download_file(resource['url'], datafile)
                print "Uploading to S3"
                url = u.upload(datafile)
                resource['url'] = url
            except:
                print '***' * 30
                print "Failed to download: ", resource['url']
        u.close()

        # Add a nice tag so we can find them all again
        dataset['tags'].append({'name': 'HQIP' })
        print 'Owner org is', org['name']
        try:
            extras = []
            if 'temporal_coverage-from' in dataset:
                extras.append(dict(key='coverage_start_date', value=format_date(dataset['temporal_coverage-from'])))
            if 'temporal_coverage' in dataset:
                extras.append(dict(key='coverage_end_date', value=format_date(dataset['temporal_coverage-to'])))
            if 'frequency' in dataset:
                extras.append(dict(key='frequency', value=dataset['update_frequency']))

            new_dataset = Dataset.create_or_update(
                name=dataset['name'],
                title=dataset['title'],
                state='active',
                visibility='private',
                license_id='uk-ogl',
                notes=dataset['notes'],
                origin=dataset['url'],
                tags=dataset['tags'],
                resources=dataset['resources'],
                owner_org=org['name'],
                extras=extras
            )
            print "Created {}".format(dataset['name'])
        except ValueError as e:
            print 'skipping because error', e
            continue
        except ValidationError:
            raise
            print "Failed to upload {}".format(dataset['name'])

コード例 #18

0

ファイルを表示

class Unzipper(object):
    def unzip(self, package):
        """ Processes each resource in the package, and possibly also
            adds more resources if we manage to extract some. """
        extra_resources = []

        self.uploader = Uploader("unzipped/{}".format(package['name']))

        updated_resources = []
        print " +", package['name']
        for resource in package['resources']:
            if not resource['format'] == 'ZIP':
                updated_resources.append(resource)
                continue

            updated_resources.append(resource)
            print "   +", resource['name']
            extract_zip_to = self.unzip_file(resource['url'])

            print "        + Processing files in ", extract_zip_to

            files = []
            for (dirpath, dirnames, filenames) in os.walk(extract_zip_to):
                files.extend([os.path.join(dirpath, p) for p in filenames])

            for f in files:
                res = self.local_file_to_resource(f, resource)
                if res:
                    updated_resources.append(res)

            shutil.rmtree(extract_zip_to)
        package['resources'] = updated_resources

        self.uploader.close()
        if extra_resources:
            package['resources'].extend(extra_resources)

    def local_file_to_resource(self, local_file, parent_resource):
        print "Adding {} from {}".format(local_file, parent_resource['name'])
        if local_file.lower().endswith('.zip'):
            return None

        filename = local_file.split('/')[-1]
        parent_desc = parent_resource['description']
        if parent_desc == parent_resource['name']:
            parent_desc = ""
        description = u"(Extracted from {}) {}".format(parent_resource['name'],
                                                       parent_desc)
        resource = {
            "description": description.replace('\u00a0', ' '),
            "name": filename,
            "format": filename.split('.')[-1].upper(),
        }

        hash_file = create_hash_file(local_file)
        try:
            url = self.uploader.upload(local_file)
            resource['url'] = url
        except Exception, e:
            print e
            return None

        os.unlink(hash_file)

        return resource

コード例 #19

0

ファイルを表示

def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'

    org = dgu.action.organization_show(id=TARGET_ORGANISATION)

    if not _org_existsp(TARGET_ORGANISATION):
        catalogue.action.organization_create(
            name=org['name'],
            title=org['title'],
            description=org['description'],
            image_url=org['image_display_url'])

    print "Found {0} datasets on source".format(len(org['packages']))

    for package in org['packages']:
        print 'uploading', package['title'].encode('utf8')
        dataset_dir = DATA_DIR / package['name']
        # Get the dataset from DGU
        dataset = dgu.action.package_show(id=package['name'])
        del dataset['id']

        # Set the new owning organisation
        dataset['owner_org'] = org['name']

        u = Uploader("hqip")
        for resource in dataset['resources']:
            resource['name'] = resource['description']
            if resource['format'] == "HTML":
                continue
            if resource['url'].startswith('hhttps'):
                resource['url'] = resource['url'].replace('hhttps', 'https')

            if 'cache_filepath' in resource:
                del resource['cache_filepath']
            if 'tracking_summary' in resource:
                del resource['tracking_summary']

            filename = filename_for_resource(resource)

            datafile = dataset_dir / filename
            print 'downloading', resource['url'], 'as', datafile

            try:
                download_file(resource['url'], datafile)
                print "Uploading to S3"
                url = u.upload(datafile)
                resource['url'] = url
            except:
                print '***' * 30
                print "Failed to download: ", resource['url']
        u.close()

        # Add a nice tag so we can find them all again
        dataset['tags'].append({'name': 'HQIP'})
        print 'Owner org is', org['name']
        try:
            extras = []
            if 'temporal_coverage-from' in dataset:
                extras.append(
                    dict(key='coverage_start_date',
                         value=format_date(dataset['temporal_coverage-from'])))
            if 'temporal_coverage' in dataset:
                extras.append(
                    dict(key='coverage_end_date',
                         value=format_date(dataset['temporal_coverage-to'])))
            if 'frequency' in dataset:
                extras.append(
                    dict(key='frequency', value=dataset['update_frequency']))

            new_dataset = Dataset.create_or_update(
                name=dataset['name'],
                title=dataset['title'],
                state='active',
                visibility='private',
                license_id='uk-ogl',
                notes=dataset['notes'],
                origin=dataset['url'],
                tags=dataset['tags'],
                resources=dataset['resources'],
                owner_org=org['name'],
                extras=extras)
            print "Created {}".format(dataset['name'])
        except ValueError as e:
            print 'skipping because error', e
            continue
        except ValidationError:
            raise
            print "Failed to upload {}".format(dataset['name'])