def fetch_data(self, storage):
    """
    Copy all datasets, groups and organizations from Ckan into ``storage``.

    A high-level client is built from ``self.url`` and the ``api_key``
    entry of ``self.conf`` (if any). Each object yielded by the client is
    logged and stored, serialized, under its object type:

    - datasets are keyed by ``id``
    - groups and organizations are keyed by ``name``

    :param storage:
        Object exposing ``set_object(type, key, value)``.
    """
    self.logger.info("Fetching data from Ckan at {0}".format(self.url))
    ckan = CkanHighlevelClient(self.url, api_key=self.conf.get('api_key'))

    # (storage type, log message template, iterator factory, key attribute)
    # The iterator factories are only called when their turn comes, so the
    # three listings are still fetched one after the other.
    sources = (
        ('dataset', "Dataset: {0}", ckan.iter_datasets, 'id'),
        ('group', "Group: {0}", ckan.iter_groups, 'name'),
        ('organization', "Organization: {0}",
         ckan.iter_organizations, 'name'),
    )

    for object_type, message, iterate, key_attribute in sources:
        for item in iterate():
            key = getattr(item, key_attribute)
            self.logger.info(message.format(key))
            storage.set_object(object_type, key, item.serialize())
def download_and_print_ckan_datasets(ckan_url):
    """
    Walk through every dataset of a Ckan instance, logging each one.

    The number of datasets is taken from ``list_datasets()`` up front and
    used as the total for progress reporting; each dataset is then fetched
    through ``iter_datasets()`` and its ``repr`` is logged at debug level
    on the ``ckan_crawl_demo`` logger.

    :param ckan_url: URL handed to ``CkanHighlevelClient``.
    :return: the number of datasets listed before iterating.
    """

    ckan = CkanHighlevelClient(ckan_url)

    log = logging.getLogger('ckan_crawl_demo')
    log.info('Starting function')

    dataset_count = len(ckan.list_datasets())
    current_app.report_progress(None, 0, dataset_count)

    # Count from 1 so the reported value is "datasets done so far".
    for done, dataset in enumerate(ckan.iter_datasets(), 1):
        log.debug(repr(dataset))
        current_app.report_progress(None, done, dataset_count)

    return dataset_count
Beispiel #3
0
    def __init__(self, base_url, api_key=None, **kw):
        """
        Create the high-level client and the synchronization settings.

        :param base_url:
            Base URL of the Ckan instance; forwarded to the high-level
            client.

        :param api_key:
            API key to use; forwarded to the high-level client.

        :param organization_merge_strategy: One of:

            - ``'create'`` (default): create the organization when it is
              missing, otherwise leave it alone.
            - ``'update'``: create the organization when it is missing,
              otherwise update it with the new values.

        :param group_merge_strategy: One of:

            - ``'create'`` (default): create the group when it is missing,
              otherwise leave it alone.
            - ``'update'``: create the group when it is missing,
              otherwise update it with the new values.

        :param dataset_preserve_names:
            when ``True`` (the default), existing datasets keep their
            old names.

        :param dataset_preserve_organization:
            when ``True`` (the default), existing datasets keep their
            old organization.

        :param dataset_group_merge_strategy: One of:

            - ``'add'`` (default): add groups, keeping the old ones
            - ``'replace'``: replace all existing groups
            - ``'preserve'``: leave groups alone

        Keyword arguments are copied into the configuration as given,
        overriding the defaults above.
        """
        ckan = CkanHighlevelClient(base_url, api_key)

        # Start from the defaults, then let the caller override them.
        settings = dict(
            organization_merge_strategy='create',
            group_merge_strategy='create',
            dataset_preserve_names=True,
            dataset_preserve_organization=True,
            dataset_group_merge_strategy='add',
        )
        settings.update(kw)

        self._client = ckan
        self._conf = settings
def test_merge_organizations(ckan_client_arguments):
    """
    Check the 'create' and 'update' organization merge strategies.

    Two organizations are created directly, then synchronized twice:
    with 'create' existing titles must stay untouched and missing
    organizations must appear; with 'update' existing titles must change.
    """
    ctor_args = ckan_client_arguments[0]
    ctor_kwargs = ckan_client_arguments[1]
    ckan = CkanHighlevelClient(*ctor_args, **ctor_kwargs)
    syncer = SynchronizationClient(*ctor_args, **ctor_kwargs)

    def title_of(org_name):
        return ckan.get_organization_by_name(org_name).title

    def sync_organizations(strategy, organizations):
        payload = {'organization': organizations, 'group': {}, 'dataset': {}}
        syncer._conf['organization_merge_strategy'] = strategy
        syncer.sync('test_merge_organizations', payload)

    # Initial state: two organizations created outside of the sync client
    for org_name, org_title in (('tmo-1', 'TMO 1'), ('tmo-2', 'TMO 2')):
        ckan.create_organization(
            CkanOrganization({'name': org_name, 'title': org_title}))

    # "create": tmo-2 already exists and keeps its title, tmo-3 is new
    sync_organizations('create', {
        'tmo-2': {'name': 'tmo-2', 'title': 'TMO 2.1'},
        'tmo-3': {'name': 'tmo-3', 'title': 'TMO 3.1'},
    })

    assert title_of('tmo-1') == 'TMO 1'
    assert title_of('tmo-2') == 'TMO 2'
    assert title_of('tmo-3') == 'TMO 3.1'

    # "update": tmo-2 gets the new title, tmo-4 is new
    sync_organizations('update', {
        'tmo-2': {'name': 'tmo-2', 'title': 'TMO 2.2'},
        'tmo-4': {'name': 'tmo-4', 'title': 'TMO 4.2'},
    })

    assert title_of('tmo-1') == 'TMO 1'
    assert title_of('tmo-2') == 'TMO 2.2'
    assert title_of('tmo-3') == 'TMO 3.1'
    assert title_of('tmo-4') == 'TMO 4.2'
    def __init__(self, base_url, api_key=None, **kw):
        """
        Set up the wrapped high-level client and the merge configuration.

        :param base_url:
            Base URL of the Ckan instance, handed to the high-level client.

        :param api_key:
            API key to be used, handed to the high-level client.

        :param organization_merge_strategy: One of:

            - 'create' (default): a missing organization is created,
              an existing one is left alone.
            - 'update': a missing organization is created,
              an existing one is updated with the new values.

        :param group_merge_strategy: One of:

            - 'create' (default): a missing group is created,
              an existing one is left alone.
            - 'update': a missing group is created,
              an existing one is updated with the new values.

        :param dataset_preserve_names:
            ``True`` (the default) preserves the old names of existing
            datasets.

        :param dataset_preserve_organization:
            ``True`` (the default) preserves the old organizations of
            existing datasets.

        :param dataset_group_merge_strategy: One of:

            - 'add': add groups, keep old ones (default)
            - 'replace': replace all existing groups
            - 'preserve': leave groups alone
        """
        self._client = CkanHighlevelClient(base_url, api_key)

        defaults = {
            'dataset_group_merge_strategy': 'add',
            'dataset_preserve_organization': True,
            'dataset_preserve_names': True,
            'group_merge_strategy': 'create',
            'organization_merge_strategy': 'create',
        }
        # Caller-supplied keyword arguments win over the defaults.
        self._conf = dict(defaults, **kw)
Beispiel #6
0
def test_merge_strategies(ckan_client_arguments):
    """
    Check the dataset-related merge options of the synchronization client.

    Covers, in order: initial creation, ``dataset_preserve_names`` on and
    off, the three ``dataset_group_merge_strategy`` values, and
    ``dataset_preserve_organization`` on and off.
    """
    ctor_args = ckan_client_arguments[0]
    ctor_kwargs = ckan_client_arguments[1]
    ckan = CkanHighlevelClient(*ctor_args, **ctor_kwargs)
    syncer = SynchronizationClient(*ctor_args, **ctor_kwargs)
    payload = copy.deepcopy(SAMPLE_DATA)

    def resync(**overrides):
        # Apply configuration overrides, then push the payload again.
        syncer._conf.update(overrides)
        syncer.sync('test_merge', payload)

    # First run: only creates new objects
    resync()

    assert ckan.get_dataset_by_name('dataset-1').title == 'Dataset #1'
    assert ckan.get_organization_by_name('org-1').title == 'Organization #1'
    assert ckan.get_group_by_name('grp-1').title == 'Group #1'

    # Names are preserved when requested
    # ------------------------------------------------------------

    first = payload['dataset']['dataset-1']
    first['name'] = 'dummy-dataset-one'
    first['title'] = 'Dataset #1.1'
    resync(dataset_preserve_names=True)

    kept = ckan.get_dataset_by_name('dataset-1')
    assert kept.name == 'dataset-1'
    assert kept.title == 'Dataset #1.1'

    # Names are updated when requested
    # ------------------------------------------------------------

    first['name'] = 'dummy-dataset-one'
    first['title'] = 'Dataset #1.2'
    resync(dataset_preserve_names=False)

    with pytest.raises(HTTPError) as excinfo:
        # The old name must be gone after the rename
        ckan.get_dataset_by_name('dataset-1')
    assert excinfo.value.status_code == 404

    # The id did not change..
    by_id = ckan.get_dataset(kept.id)
    assert by_id.name == 'dummy-dataset-one'
    assert by_id.title == 'Dataset #1.2'

    # ..and the new name resolves
    by_name = ckan.get_dataset_by_name('dummy-dataset-one')
    assert by_name.name == 'dummy-dataset-one'
    assert by_name.title == 'Dataset #1.2'

    # Group merging
    # ============================================================

    grp1_id = ckan.get_group_by_name('grp-1').id
    grp2_id = ckan.get_group_by_name('grp-2').id
    only_first_group = set([grp1_id])
    both_groups = set([grp1_id, grp2_id])

    def current_groups():
        return ckan.get_dataset_by_name('dataset-2').groups

    assert current_groups() == both_groups
    second = payload['dataset']['dataset-2']

    # 'replace': the dataset ends up with exactly the source groups
    second['groups'] = ['grp-1']
    resync(dataset_group_merge_strategy='replace')
    assert current_groups() == only_first_group

    # 'add': source groups are added to the existing ones
    second['groups'] = ['grp-2']
    resync(dataset_group_merge_strategy='add')
    assert current_groups() == both_groups

    # 'preserve': source groups are ignored
    second['groups'] = ['grp-3']
    resync(dataset_group_merge_strategy='preserve')
    assert current_groups() == both_groups

    # Organization merging
    # ============================================================

    org1_id = ckan.get_organization_by_name('org-1').id
    org2_id = ckan.get_organization_by_name('org-2').id

    def current_owner():
        return ckan.get_dataset_by_name('dataset-2').owner_org

    assert current_owner() == org1_id

    # Preserving: the owner stays the same
    second['owner_org'] = 'org-2'
    resync(dataset_preserve_organization=True)
    assert current_owner() == org1_id

    # Not preserving: the owner follows the source
    second['owner_org'] = 'org-2'
    resync(dataset_preserve_organization=False)
    assert current_owner() == org2_id
Beispiel #7
0
def test_merge_organizations(ckan_client_arguments):
    """
    Verify how organizations are merged under each strategy.

    With 'create', organizations that already exist keep their title and
    missing ones are created; with 'update', existing organizations take
    the title from the synchronized data.
    """
    positional = ckan_client_arguments[0]
    named = ckan_client_arguments[1]
    ckan = CkanHighlevelClient(*positional, **named)
    syncer = SynchronizationClient(*positional, **named)

    def organization(name, title):
        return {'name': name, 'title': title}

    def run_sync(strategy, organizations):
        payload = {
            'organization': dict(
                (org['name'], org) for org in organizations),
            'group': {},
            'dataset': {},
        }
        syncer._conf['organization_merge_strategy'] = strategy
        syncer.sync('test_merge_organizations', payload)

    def check_titles(expected):
        for name, title in expected:
            assert ckan.get_organization_by_name(name).title == title

    # Starting point: two organizations created through the plain client
    ckan.create_organization(CkanOrganization(organization('tmo-1', 'TMO 1')))
    ckan.create_organization(CkanOrganization(organization('tmo-2', 'TMO 2')))

    # "create" strategy
    # ------------------------------------------------------------

    run_sync('create', [
        organization('tmo-2', 'TMO 2.1'),
        organization('tmo-3', 'TMO 3.1'),
    ])
    check_titles([
        ('tmo-1', 'TMO 1'),
        ('tmo-2', 'TMO 2'),
        ('tmo-3', 'TMO 3.1'),
    ])

    # "update" strategy
    # ------------------------------------------------------------

    run_sync('update', [
        organization('tmo-2', 'TMO 2.2'),
        organization('tmo-4', 'TMO 4.2'),
    ])
    check_titles([
        ('tmo-1', 'TMO 1'),
        ('tmo-2', 'TMO 2.2'),
        ('tmo-3', 'TMO 3.1'),
        ('tmo-4', 'TMO 4.2'),
    ])
import urllib2
from ckan_api_client.tests.conftest import data_dir
import ssl
__author__ = 'janci'
from ckan_api_client.high_level import CkanHighlevelClient
from ckan_api_client.objects import CkanDataset
client = CkanHighlevelClient('http://192.168.128.19', api_key='48155aab-f1c0-4cfc-96db-a3530de09acc')

datasets = client.list_datasets();
for dataset in datasets:
    dataset = client.get_dataset(dataset)

    for resource in dataset.resources:
        proxy_support = urllib2.ProxyHandler({"http":"http://proxy.in.eea.sk:3128"})
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        try:
         html = urllib2.urlopen(resource.url).read()
        except Exception as e:
            print dataset.name, resource.url, e
Beispiel #9
0
def test_dataset_import_export(ckan_instance):
    """
    Check that a served Ckan instance initially lists no datasets.

    The client is authenticated with the sysadmin API key of the
    ``ckan_instance`` fixture and is only used while the instance is
    being served.
    """
    sysadmin_key = ckan_instance.get_sysadmin_api_key()

    with ckan_instance.serve():
        ckan = CkanHighlevelClient(
            ckan_instance.server_url, api_key=sysadmin_key)
        listed = ckan.list_datasets()
        assert listed == []
def test_merge_strategies(ckan_client_arguments):
    """
    Run the synchronization client through its dataset merge options.

    The same payload (a deep copy of ``SAMPLE_DATA``) is tweaked and
    re-synchronized after each configuration change, and the state in
    Ckan is checked through a plain high-level client.
    """
    positional = ckan_client_arguments[0]
    named = ckan_client_arguments[1]
    ckan = CkanHighlevelClient(*positional, **named)
    syncer = SynchronizationClient(*positional, **named)
    source = copy.deepcopy(SAMPLE_DATA)

    def configure(option, value):
        syncer._conf[option] = value

    def push():
        syncer.sync('test_merge', source)

    def fetch(dataset_name):
        return ckan.get_dataset_by_name(dataset_name)

    # Initial synchronization -- should create new datasets only
    push()

    assert fetch('dataset-1').title == 'Dataset #1'
    assert ckan.get_organization_by_name('org-1').title == 'Organization #1'  # noqa
    assert ckan.get_group_by_name('grp-1').title == 'Group #1'  # noqa

    # dataset_preserve_names = True: the name stays, the title changes
    # ------------------------------------------------------------

    configure('dataset_preserve_names', True)
    source['dataset']['dataset-1']['name'] = 'dummy-dataset-one'
    source['dataset']['dataset-1']['title'] = 'Dataset #1.1'
    push()

    original = fetch('dataset-1')
    assert original.name == 'dataset-1'
    assert original.title == 'Dataset #1.1'

    # dataset_preserve_names = False: the dataset is renamed
    # ------------------------------------------------------------

    configure('dataset_preserve_names', False)
    source['dataset']['dataset-1']['name'] = 'dummy-dataset-one'
    source['dataset']['dataset-1']['title'] = 'Dataset #1.2'
    push()

    with pytest.raises(HTTPError) as excinfo:
        # Looking it up under the old name must now fail
        fetch('dataset-1')
    assert excinfo.value.status_code == 404

    # Same object, looked up by its (unchanged) id and by its new name
    renamed_candidates = (
        ckan.get_dataset(original.id),
        fetch('dummy-dataset-one'),
    )
    for renamed in renamed_candidates:
        assert renamed.name == 'dummy-dataset-one'
        assert renamed.title == 'Dataset #1.2'

    # Groups
    # ============================================================

    grp1_id = ckan.get_group_by_name('grp-1').id
    grp2_id = ckan.get_group_by_name('grp-2').id

    assert fetch('dataset-2').groups == set([grp1_id, grp2_id])

    # (strategy, groups in the source, groups expected in Ckan afterwards)
    group_steps = (
        ('replace', ['grp-1'], set([grp1_id])),
        ('add', ['grp-2'], set([grp1_id, grp2_id])),
        ('preserve', ['grp-3'], set([grp1_id, grp2_id])),
    )
    for strategy, source_groups, expected_groups in group_steps:
        configure('dataset_group_merge_strategy', strategy)
        source['dataset']['dataset-2']['groups'] = source_groups
        push()
        assert fetch('dataset-2').groups == expected_groups

    # Organizations
    # ============================================================

    org1_id = ckan.get_organization_by_name('org-1').id
    org2_id = ckan.get_organization_by_name('org-2').id

    assert fetch('dataset-2').owner_org == org1_id

    # (preserve organization?, owner expected in Ckan afterwards)
    for preserve, expected_owner in ((True, org1_id), (False, org2_id)):
        configure('dataset_preserve_organization', preserve)
        source['dataset']['dataset-2']['owner_org'] = 'org-2'
        push()
        assert fetch('dataset-2').owner_org == expected_owner
class SynchronizationClient(object):
    """
    Synchronization client, providing functionality for importing
    collections of datasets into a Ckan instance.

    Synchronization acts as follows:

    - Ensure all the required organizations/groups are there;
      create a map between "source" ids and Ckan ids.
      Optionally update existing organizations/groups with
      new details.

    - Find all the Ckan datasets matching the ``source_name``

    - Determine which datasets...

      - ...need to be created
      - ...need to be updated
      - ...need to be deleted

    - First, delete datasets to be deleted in order to free up names

    - Then, create datasets that need to be created

    - Lastly, update datasets using the configured merge strategy
      (see constructor arguments).
    """

    def __init__(self, base_url, api_key=None, **kw):
        """
        :param base_url:
            Base URL of the Ckan instance, passed to high-level client

        :param api_key:
            API key to be used, passed to high-level client

        :param organization_merge_strategy: One of:

            - 'create' (default) if the organization doesn't exist, create it.
              Otherwise, leave it alone.
            - 'update' if the organization doesn't exist, create it.
              Otherwise, update with new values.

        :param group_merge_strategy: One of:

            - 'create' (default) if the group doesn't exist, create it.
              Otherwise, leave it alone.
            - 'update' if the group doesn't exist, create it.
              Otherwise, update with new values.

        :param dataset_preserve_names:
            if ``True`` (the default) will preserve old names of existing
            datasets

        :param dataset_preserve_organization:
            if ``True`` (the default) will preserve old organizations of
            existing datasets.

        :param dataset_group_merge_strategy:
            - 'add' add groups, keep old ones (default)
            - 'replace' replace all existing groups
            - 'preserve' leave groups alone
        """
        self._client = CkanHighlevelClient(base_url, api_key)
        # Defaults first; any keyword argument overrides (or extends) them.
        self._conf = {
            'organization_merge_strategy': 'create',
            'group_merge_strategy': 'create',
            'dataset_preserve_names': True,
            'dataset_preserve_organization': True,
            'dataset_group_merge_strategy': 'add',
        }
        self._conf.update(kw)

    def sync(self, source_name, data):
        """
        Synchronize data from a source into Ckan.

        - datasets are matched by _harvest_source
        - groups and organizations are matched by name

        Datasets found only in Ckan are deleted, datasets found only in
        the source are created, and datasets present on both sides but
        differing are merged and updated. Progress of each of the three
        phases is reported through ``report_progress``.

        :param source_name:
            String identifying the source of the data. Used to build
            ids that will be used in further synchronizations.
        :param data:
            Data to be synchronized. Should be a dict (or dict-like)
            with top level keys corresponding to the object type
            (``'group'``, ``'organization'`` and ``'dataset'`` are all
            read), mapping to dictionaries of ``{'id': <object>}``.
            Every dataset is indexed by ``'groups'`` and ``'owner_org'``,
            so both keys are expected to be present.
        :raises HTTPError:
            re-raised from dataset creation when the response status is
            not 409, or when a 409 persists after all rename attempts.
            Errors from the delete and update calls are not caught here.
        """

        groups = dict(
            (key, CkanGroup(val))
            for key, val in data['group'].iteritems())

        organizations = dict(
            (key, CkanOrganization(val))
            for key, val in data['organization'].iteritems())

        # Upsert groups and organizations
        groups_map = self._upsert_groups(groups)
        orgs_map = self._upsert_organizations(organizations)

        # Create list of datasets to be synced
        logger.info('Creating list of datasets to be synchronized')
        source_datasets = {}
        for source_id, dataset_dict in data['dataset'].iteritems():
            # Work on a copy: the caller's data must not be modified
            _dataset_dict = copy.deepcopy(dataset_dict)

            # We need to make sure "source" datasets
            # don't have (otherwise misleading) ids
            _dataset_dict.pop('id', None)

            # We need to update groups and organizations,
            # to map their name from the source into a
            # ckan id
            _dataset_dict['groups'] = [
                groups_map.to_ckan(grp_id)
                for grp_id in _dataset_dict['groups']
            ]
            _dataset_dict['owner_org'] = \
                orgs_map.to_ckan(_dataset_dict['owner_org'])

            dataset = CkanDataset(_dataset_dict)

            # We also want to add the "source id", used for further
            # synchronizations to find stuff
            dataset.extras[HARVEST_SOURCE_ID_FIELD] = \
                self._join_source_id(source_name, source_id)

            source_datasets[source_id] = dataset

        # Retrieve list of datasets from Ckan
        logger.info('Retrieving current status from Ckan')
        ckan_datasets = self._find_datasets_by_source(source_name)

        # Compare collections to find differences.
        # As used below: 'left' holds ids only found in Ckan, 'right' ids
        # only found in the source, 'differing' ids found in both.
        differences = self._compare_collections(
            ckan_datasets, source_datasets)

        # ------------------------------------------------------------
        # We now need to create/update/delete datasets.

        # todo: we need to make sure dataset names are not
        # already used by another dataset. The only
        # way is to randomize resource names and hope
        # a 409 response indicates duplicate name..

        # _progress_total = sum(len(differences[x])
        #                       for x in ('left', 'right', 'differing'))
        # _progress_next = itertools.count(1).next
        # report_progress(0, _progress_total)

        # One independent 1-based counter per phase
        _prog_tot_add = len(differences['right'])
        _prog_next_add = itertools.count(1).next
        _prog_tot_remove = len(differences['left'])
        _prog_next_remove = itertools.count(1).next
        _prog_tot_update = len(differences['differing'])
        _prog_next_update = itertools.count(1).next

        # Create progress bars early..
        report_progress(('datasets', 'delete'), 0, _prog_tot_remove)
        report_progress(('datasets', 'create'), 0, _prog_tot_add)
        report_progress(('datasets', 'update'), 0, _prog_tot_update)

        # We delete first, in order to (possibly) deallocate
        # some already-used names..
        for source_id in differences['left']:
            ckan_id = ckan_datasets[source_id].id
            logger.info('Deleting dataset {0}'.format(ckan_id))
            self._client.delete_dataset(ckan_id)
            report_progress(('datasets', 'delete'),
                            _prog_next_remove(), _prog_tot_remove)

        def force_dataset_operation(operation, dataset, retry=5):
            """
            Run ``operation(dataset)``, renaming the dataset on conflict.

            On an HTTP 409 response the dataset gets a new name, built
            from the (truncated) original name plus a random six-digit
            suffix, and the operation is tried again, up to ``retry``
            more times. Any other error is re-raised immediately.
            """
            # Maximum dataset name length is 100 characters
            # We trim it down to 80 just to be safe.

            # Note: we generally want to preserve the original name
            #       and there should *never* be problems with that
            #       when updating..

            _orig_name = dataset.name[:80]
            dataset.name = _orig_name

            while True:
                try:
                    result = operation(dataset)
                except HTTPError as e:
                    if e.status_code != 409:
                        raise
                    retry -= 1
                    if retry < 0:
                        # Out of attempts: propagate the last 409
                        raise
                    dataset.name = '{0}-{1:06d}'.format(
                        _orig_name,
                        random.randint(0, 999999))
                    logger.debug('Got 409: trying to rename dataset to {0}'
                                 .format(dataset.name))
                else:
                    return result

        # Create missing datasets
        for source_id in differences['right']:
            logger.info('Creating dataset {0}'.format(source_id))
            dataset = source_datasets[source_id]
            force_dataset_operation(self._client.create_dataset, dataset)
            report_progress(('datasets', 'create'),
                            _prog_next_add(), _prog_tot_add)

        # Update outdated datasets
        for source_id in differences['differing']:
            logger.info('Updating dataset {0}'.format(source_id))
            # dataset = source_datasets[source_id]
            old_dataset = ckan_datasets[source_id]
            new_dataset = source_datasets[source_id]
            dataset = self._merge_datasets(old_dataset, new_dataset)
            dataset.id = old_dataset.id  # Mandatory!
            self._client.update_dataset(dataset)  # should never fail!
            report_progress(('datasets', 'update'),
                            _prog_next_update(), _prog_tot_update)
Beispiel #12
0
from ckan_api_client.high_level import CkanHighlevelClient
from ckan_api_client.objects import CkanDataset

# Connection settings for the example.
# Alternative targets:
#   ckan_url = "https://demo.ckan.org"
#   ckan_url = "http://ckan.dev.pfe.co.nz"
ckan_url = "http://localhost:5000"
API_Key = "cdc0284b-47c9-48ff-8a17-5861259c5a03"
ua = 'ckanapiexample/1.0 (+http://pfr.co.nz/)'

client = CkanHighlevelClient(ckan_url, api_key=API_Key)

# Details of the dataset we're going to create.
dataset_dict = dict(
    name="lure-dispenser-comparison-trial2",
    title="Lure Dispenser comparison trial, thrips, Australia, Perth",
    # Adjacent literals are joined into one string: a short heading
    # followed by a three-item bullet list.
    notes=(
        "Assess three thrips Lure delivery mechanisms:\n\n"
        "* P Paint pen\n"
        "* D Deer wick\n"
        "* C Control (no wick)"
    ),
    private=False,
    owner_org="plant-and-food-research-nz",
    author="Mette Nielson",
)

dataset_dict2 = {
    "name": "lure-dispenser-comparison-trial",
    "title": "Lure Dispenser comparison trial, thrips, Australia, Perth",
    "notes":
    "Assess three thrips Lure delivery mechanisms:\n\n* P Paint pen\n* D Deer wick\n* C Control (no wick)",
    "private": False,
    "owner_org": "plant-and-food-research-nz",
    "state": "active",
    "project_code": "P/1234",
# Show the planned changes, ask for confirmation, then apply them.

# Sorted once; the mapping is not modified between the preview and the
# actual update, so both loops see the same sequence.
_planned_updates = sorted(datasets_to_associate.iteritems())

print("Update:")
for dataset_id, (source_name, source_id) in _planned_updates:
    print('    {0} -> {1}:{2}'.format(dataset_id, source_name, source_id))
print()

print("Delete:")
for dataset_id in datasets_to_delete:
    print('    {0}'.format(dataset_id))
print()

# Anything but "y" / "Y" aborts.
answer = raw_input('Confirm? [y/N] ')
if not answer.lower() == 'y':
    print("Aborted.")
    sys.exit(1)

client = CkanHighlevelClient(TARGET_CKAN_URL, api_key=TARGET_CKAN_APIKEY)

# Tag each dataset with the "<source name>:<source id>" it belongs to.
for dataset_id, (source_name, source_id) in _planned_updates:
    print('Update {0}: source={1}:{2}'.format(
        dataset_id, source_name, source_id))
    dataset = client.get_dataset(dataset_id)
    harvest_source = '{0}:{1}'.format(source_name, source_id)
    dataset.extras[HARVEST_SOURCE_ID_FIELD] = harvest_source
    client.update_dataset(dataset)

for dataset_id in datasets_to_delete:
    print('Delete: {0}'.format(dataset_id))
    client.delete_dataset(dataset_id)

print()
print("Ok, now you can run the harvester to import stuff.")
print("Good luck!")
Beispiel #14
0
class SynchronizationClient(object):
    """
    Synchronization client, providing functionality for importing
    collections of datasets into a Ckan instance.

    Synchronization acts as follows:

    - Ensure all the required organizations/groups are there;
      create a map between "source" ids and Ckan ids.
      Optionally update existing organizations/groups with
      new details.

    - Find all the Ckan datasets matching the ``source_name``

    - Determine which datasets...

      - ...need to be created
      - ...need to be updated
      - ...need to be deleted

    - First, delete datasets to be deleted in order to free up names

    - Then, create datasets that need to be created

    - Lastly, update datasets using the configured merge strategy
      (see constructor arguments).
    """

    def __init__(self, base_url, api_key=None, **kw):
        """
        Set up the high-level Ckan client and the merge configuration.

        Any extra keyword argument is stored in the configuration,
        overriding the default value of the option with the same name.

        :param base_url:
            Base URL of the Ckan instance, passed to high-level client

        :param api_key:
            API key to be used, passed to high-level client

        :param organization_merge_strategy: One of:

            - 'create' (default): create the organization if it
              doesn't exist; otherwise, leave it alone.
            - 'update': create the organization if it doesn't exist;
              otherwise, update it with new values.

        :param group_merge_strategy: One of:

            - 'create' (default): create the group if it doesn't
              exist; otherwise, leave it alone.
            - 'update': create the group if it doesn't exist;
              otherwise, update it with new values.

        :param dataset_preserve_names:
            if ``True`` (the default), old names of existing datasets
            are preserved

        :param dataset_preserve_organization:
            if ``True`` (the default), old organizations of existing
            datasets are preserved

        :param dataset_group_merge_strategy: One of:

            - 'add' (default): add groups, keep old ones
            - 'replace': replace all existing groups
            - 'preserve': leave groups alone
        """
        self._client = CkanHighlevelClient(base_url, api_key)

        # Start from the documented defaults...
        settings = dict(
            organization_merge_strategy='create',
            group_merge_strategy='create',
            dataset_preserve_names=True,
            dataset_preserve_organization=True,
            dataset_group_merge_strategy='add',
        )
        # ...then let the caller-supplied options take precedence.
        for option, value in kw.items():
            settings[option] = value
        self._conf = settings

    def sync(self, source_name, data):
        """
        Synchronize data from a source into Ckan.

        - datasets are matched by _harvest_source
        - groups and organizations are matched by name

        :param source_name:
            String identifying the source of the data. Used to build
            ids that will be used in further synchronizations.
        :param data:
            Data to be synchronized. Should be a dict (or dict-like)
            with top level keys coresponding to the object type,
            mapping to dictionaries of ``{'id': <object>}``.
        """

        groups = dict(
            (key, CkanGroup(val))
            for key, val in data['group'].iteritems())

        organizations = dict(
            (key, CkanOrganization(val))
            for key, val in data['organization'].iteritems())

        # Upsert groups and organizations
        groups_map = self._upsert_groups(groups)
        orgs_map = self._upsert_organizations(organizations)

        # Create list of datasets to be synced
        source_datasets = {}
        for source_id, dataset_dict in data['dataset'].iteritems():
            _dataset_dict = copy.deepcopy(dataset_dict)

            # We need to make sure "source" datasets
            # don't have (otherwise misleading) ids
            _dataset_dict.pop('id', None)

            # We need to update groups and organizations,
            # to map their name from the source into a
            # ckan id
            _dataset_dict['groups'] = [
                groups_map.to_ckan(grp_id)
                for grp_id in _dataset_dict['groups']
            ]
            _dataset_dict['owner_org'] = \
                orgs_map.to_ckan(_dataset_dict['owner_org'])

            dataset = CkanDataset(_dataset_dict)

            # We also want to add the "source id", used for further
            # synchronizations to find stuff
            dataset.extras[HARVEST_SOURCE_ID_FIELD] = \
                self._join_source_id(source_name, source_id)

            source_datasets[source_id] = dataset

        # Retrieve list of datasets from Ckan
        ckan_datasets = self._find_datasets_by_source(source_name)

        # Compare collections to find differences
        differences = self._compare_collections(
            ckan_datasets, source_datasets)

        # ------------------------------------------------------------
        # We now need to create/update/delete datasets.

        # todo: we need to make sure dataset names are not
        # already used by another dataset. The only
        # way is to randomize resource names and hope
        # a 409 response indicates duplicate name..

        # We delete first, in order to (possibly) deallocate
        # some already-used names..
        for source_id in differences['left']:
            ckan_id = ckan_datasets[source_id].id
            logger.info('Deleting dataset {0}'.format(ckan_id))
            self._client.delete_dataset(ckan_id)

        def force_dataset_operation(operation, dataset, retry=5):
            # Maximum dataset name length is 100 characters
            # We trim it down to 80 just to be safe.

            # Note: we generally want to preserve the original name
            #       and there should *never* be problems with that
            #       when updating..

            _orig_name = dataset.name[:80]
            dataset.name = _orig_name

            while True:
                try:
                    result = operation(dataset)
                except HTTPError, e:
                    if e.status_code != 409:
                        raise
                    retry -= 1
                    if retry < 0:
                        raise
                    dataset.name = '{0}-{1:06d}'.format(
                        _orig_name,
                        random.randint(0, 999999))
                    logger.debug('Got 409: trying to rename dataset to {0}'
                                 .format(dataset.name))
                else:
                    return result

        # Create missing datasets
        for source_id in differences['right']:
            logger.info('Creating dataset {0}'.format(source_id))
            dataset = source_datasets[source_id]
            force_dataset_operation(self._client.create_dataset, dataset)

        # Update outdated datasets
        for source_id in differences['differing']:
            logger.info('Updating dataset {0}'.format(source_id))
            # dataset = source_datasets[source_id]
            old_dataset = ckan_datasets[source_id]
            new_dataset = source_datasets[source_id]
            dataset = self._merge_datasets(old_dataset, new_dataset)
            dataset.id = old_dataset.id  # Mandatory!
            self._client.update_dataset(dataset)  # should never fail!