Example 1
def main(workspace):
    DATA_DIR = ffs.Path(workspace)
    datasets = json.load(open(DATA_DIR / 'ccgois_indicators.json'))
    dc.ensure_publisher('hscic')
    dc.ensure_group('ccgois')
    load_ccgois(datasets)
    group_ccgois(datasets)
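All of the loaders in this listing lean on the same dc helper module (ensure_publisher, ensure_group, dc.ckan.action.*), which is not part of these excerpts. A minimal sketch of what dc.ensure_group might look like, assuming dc wraps a ckanapi RemoteCKAN client (the URL and API key below are placeholders):

import ckanapi

# Assumed client; the real dc module configures this elsewhere.
ckan = ckanapi.RemoteCKAN('https://ckan.example.org', apikey='...')

def ensure_group(name, title=None):
    # Create the CKAN group only if it does not already exist.
    try:
        ckan.action.group_show(id=name)
    except ckanapi.NotFound:
        ckan.action.group_create(name=name, title=title or name)
    return name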
Example 3
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    dc.ensure_publisher('phe')
    dc.ensure_group('phof')
    load_phe()
    #group_phe()
    return 0
Example 4
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    dc.ensure_publisher('hscic')
    dc.ensure_group('qof')
    load_qof()
    group_qof()
    return 0
Example 6
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    dc.ensure_publisher('nhs-england')
    dc.ensure_group('surveys')

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), "r"))
    for dataset in datasets:
        if load_dataset(dataset, DATA_DIR):
            groups(dataset)
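The metadata.json file read here is not included in the excerpts. Judging from the fields the loaders access (name, title, groups, summary and a list of sources), a record plausibly looks like the following; every value is illustrative, not real metadata:

datasets = [
    {
        "name": "statistics-on-obesity-2014",      # hypothetical
        "title": "Statistics on Obesity 2014",     # hypothetical
        "groups": ["statistics"],
        "summary": "Annual obesity statistics.",   # hypothetical
        "sources": [
            {
                "url": "https://example.org/obesity-2014.csv",  # hypothetical
                "description": "Data tables",
                "filetype": "CSV",
            }
        ],
    }
]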
Example 7
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    dc.ensure_publisher('nhs-england')
    dc.ensure_group('statistics')

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), "r"))
    for dataset in datasets:
        if load_statistic(dataset, DATA_DIR):
            groups(dataset)
def groups(dataset):
    groups = dataset.get('groups', [])

    dataset = dc.ckan.action.package_show(id=dataset["name"])
    for grp in groups:
        if any(g['name'].lower() == grp for g in dataset['groups']):
            print('Already in group', grp)
        else:
            dc.ensure_group(grp)
            dc.ckan.action.member_create(id=grp,
                                         object=dataset['name'],
                                         object_type='package',
                                         capacity='member')
    return
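A quick usage sketch for groups() above; the record is hypothetical but shaped like the metadata these loaders consume. member_create with object_type='package' and capacity='member' is the stock CKAN API action for attaching a dataset to a group.

dataset = {
    'name': 'gp-patient-survey-2014',  # hypothetical package name
    'groups': ['surveys'],
}
groups(dataset)  # ensures the group exists, then adds the package to it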
Example 9
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(get_resource_file(DATA_DIR / 'nhsof_metadata_indicators.json'))
    print "Ensuring publisher"
    dc.ensure_publisher('hscic')
    print "Ensuring group"
    dc.ensure_group('nhsof')
    wrote = load_nhsof(datasets)
    if wrote:
        group_nhsof(datasets)
    else:
        print "Created/processed no datasets ..."
Example 10
def main(workspace):
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    dc.ensure_publisher('gp-survey')
    dc.ensure_group('surveys')

    # Sort key: assumes each title ends with its four-digit survey year,
    # e.g. "GP Patient Survey 2014" (title is illustrative).
    def year_as_key(x):
        return x['title'][-4:]

    datasets = json.load(open(os.path.join(DATA_DIR, "metadata.json"), "r"))
    datasets = sorted(datasets, key=year_as_key)
    for dataset in datasets:
        load_statistic(dataset, DATA_DIR)
        groups(dataset)
Example 11
def groups(dataset):
    groups = dataset.get('groups', [])

    dataset = dc.ckan.action.package_show(id=dataset["name"])
    for grp in groups:
        if any(g['name'].lower() == grp for g in dataset['groups']):
            print('Already in group', grp)
        else:
            dc.ensure_group(grp)
            dc.ckan.action.member_create(
                id=grp,
                object=dataset['name'],
                object_type='package',
                capacity='member'
            )
    return
Example 12
def main(workspace):
    global DATA_DIR
    DATA_DIR = ffs.Path(workspace) / 'data'
    DATA_DIR.mkdir()

    datasets = json.load(
        get_resource_file(DATA_DIR / 'nhsof_metadata_indicators.json'))
    print "Ensuring publisher"
    dc.ensure_publisher('hscic')
    print "Ensuring group"
    dc.ensure_group('nhsof')
    wrote = load_nhsof(datasets)
    if wrote:
        group_nhsof(datasets)
    else:
        print "Created/processed no datasets ..."
def curate_hscic_indicators():
    """
    Curate it!
    """
    dc.ensure_group('nhsof')
    dc.ensure_group('ccgois')

    indicators = DATA_DIR / 'indicators.json'
    data = indicators.json_load()
    for indicator in data:
        if is_indicator(indicator):
            framework = determine_framework(indicator)
            number = indicator['title'].split(' ')[0]
            print(framework, number)
            unique_id = 'hscic_indicator_' + indicator['unique identifier'].lower()
            print(unique_id)
            newname = u'{0} {1}'.format(framework, indicator['title'])
            rename(unique_id, newname)
            add_tag(indicator, framework)
            add_to_group(indicator, framework)
    return
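rename, add_tag and add_to_group are not shown in these excerpts. For rename, a minimal sketch, assuming dc.ckan is the same ckanapi client the other examples use; package_patch is the stock CKAN action that updates only the fields you pass:

def rename(name, new_title):
    # Update just the title, leaving the rest of the package untouched.
    dc.ckan.action.package_patch(id=name, title=new_title)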
def publish_datasets(start_from=0):
    global DATA_DIR

    u = Uploader("hscic-datasets")

    datasetfile = ffs.Path(get_resource_path('datasets.json'))
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    logging.info('Processing {} indicators'.format(len(datasets)))
    logging.info('Starting from record {}'.format(start_from))

    total = len(datasets) - start_from
    current = 1

    for dataset in datasets[start_from:]:
        print("STATUS: {}/{}".format(current, total))
        current += 1

        #print u'Processing {}'.format(dataset['title'])
        #print '  ID: {}'.format(dataset['id'])
        try:
            resources = []
            for s in dataset['sources']:
                resource = {
                    "description": s['description'],
                    "name": s['url'].split('/')[-1],
                    "format": s['filetype'],
                    "url": s["url"]
                }
                """
                filename = filename_for_resource(resource)
                path = DATA_DIR / filename
                download_file(resource['url'], path)
                resource['url'] = u.upload(path)
                """
                resources.append(resource)

            if not resources:
                print "Dataset {} does not have any resources".format(
                    dataset['id'])
                continue

            title = dataset['title']

            c = Curator(dataset)
            groups = c.get_groups()
            if not groups:
                print "Not in a group"
                continue

            prefix = c.get_title_prefix()
            if prefix:
                title = u"{} - {}".format(prefix, title)
            name = slugify.slugify(title).lower()[0:99]

            # Call clean_tag on each keyword and flatten the returned lists

            tags = []
            if 'keywords' in dataset:
                dataset['keywords'] = sum([
                    clean_tag(k)
                    for k in dataset.get('keywords', []) if len(k) > 2
                ], [])
                tags = dc.tags(*dataset['keywords'])

            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\n<h2>KEY FACTS:</h2>\n' + ''.join(
                    dataset['key_facts'])
            notes = to_markdown(notes)

            # NOTE: overrides the slugified title-based name above with a
            # stable id-based one.
            name = 'hscic_dataset_{}'.format(dataset['id'])

            dc.Dataset.create_or_update(name=name,
                                        title=title,
                                        state='active',
                                        licence_id='ogl',
                                        notes=notes,
                                        url=dataset['source'],
                                        tags=tags,
                                        resources=resources,
                                        owner_org='hscic')

            if groups:
                try:
                    dataset = dc.ckan.action.package_show(id=name)
                except Exception:
                    continue

                for group in groups:
                    group = group.lower()

                    if any(g['name'] == group
                           for g in dataset.get('groups', [])):
                        print('Already in group', group)
                    else:
                        dc.ensure_group(group)
                        dc.ckan.action.member_create(id=group,
                                                     object=dataset['id'],
                                                     object_type='package',
                                                     capacity='member')
        except Exception:
            import traceback
            traceback.print_exc()

    u.close()
    return
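to_markdown is another helper missing from the excerpts; since the notes arrive as HTML (see the <h2> injected above), it plausibly wraps an HTML-to-markdown converter such as html2text. A sketch under that assumption:

import html2text

def to_markdown(html):
    # Convert the scraped HTML notes into markdown for CKAN's notes field.
    return html2text.html2text(html)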
Example 16
def publish_datasets(start_from=0):
    datasetfile = DATA_DIR / 'datasets.json'
    logging.info('Loading {}'.format(datasetfile))
    datasets = datasetfile.json_load()
    amount = len(datasets)
    logging.info('Processing {} datasets'.format(amount))
    logging.info('Starting from record {}'.format(start_from))
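    # NOTE: the hard upper bound of 10 below looks like a leftover debug
    # limit; it caps every run at the first ten records.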
    for dataset in datasets[start_from:10]:
        try:
            logging.info('{} of {}'.format(start_from, amount))
            start_from += 1
            logging.info('Processing {}'.format(dataset['title']))
            logging.info('ID: {}'.format(dataset['id']))
            resources = [
                dict(description=s['description'],
                     name=s['url'].split('/')[-1],
                     format=s['filetype'],
                     upload=dc.fh_for_url(s['url']))
                for s in dataset['sources']
            ]
            notes = dataset['summary']
            if 'key_facts' in dataset:
                notes += '\n\nKEY FACTS:\n==========\n\n' + dataset['key_facts']
            extras = [
                {
                    'key': 'Public Access Level',
                    'value': 'Public',
                },
                {
                    'key': 'Data Quality Assurance',
                    'value': 'False'
                },
                {
                    'key': 'Status',
                    'value': 'Live',
                },
            ]
            if 'date_range' in dataset:
                extras.append({
                    'key': 'Time period',
                    'value': dataset['date_range'],
                })
            if 'publication_date' in dataset:
                extras.append({
                    'key': 'Release date',
                    'value': dataset['publication_date'],
                })
            if 'geographical_coverage' in dataset:
                extras.append({
                    'key': 'Geographical coverage',
                    'value': ', '.join(dataset['geographical_coverage'])
                })
            # groups: topics and information types both map to CKAN groups
            groups = dataset['topics'] + dataset['information_types']
            group_faff = []
            for g in groups:
                group_name = dc.ensure_group(g, 'HSCIC')
                group_faff.append({
                    'name': group_name,
                })
            name = 'hscic_dataset_{}'.format(dataset['id'])
            # NHSEngland metadata as comments...
            dc.Dataset.create_or_update(
                name=name,  # Unique ID
                title=dataset['title'],  # title
                notes=notes,  # description
                tags=dc.tags(*dataset['keywords']),  # tags
                extras=extras,
                state='active',
                licence_id='ogl',
                url=dataset['source'],
                resources=resources,
                groups=group_faff,
                owner_org='hscic'  # publisher
            )
        except Exception as ex:
            logging.error(ex)
    return
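Unlike the earlier publish_datasets, this version uploads a copy of each source file via upload=dc.fh_for_url(s['url']) instead of linking to the original URL. fh_for_url is not shown; a minimal sketch, assuming it fetches the file with requests and returns the kind of file-like object ckanapi accepts for an upload:

import io
import requests

def fh_for_url(url):
    # Fetch the source file and hand back an in-memory file handle;
    # the .name attribute supplies the filename for the multipart upload.
    response = requests.get(url)
    response.raise_for_status()
    fh = io.BytesIO(response.content)
    fh.name = url.split('/')[-1]
    return fh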
Example 17
def main():
    dc.ensure_publisher('hscic')
    dc.ensure_group('indicators', 'hscic')
    publish_indicators()
    publish_datasets()
    return 0