Esempio n. 1
0
def update_extents():
    """Regenerate the stored extent of every package with a 'spatial' extra.

    The ``spatial`` extra of each package is decoded as JSON and passed to
    ``save_package_extent``. Packages whose value cannot be decoded are
    recorded and skipped. The session is committed once at the end, then
    any errors and a summary line are printed.
    """
    from ckan.model import PackageExtra, Session

    spatial_extras = Session.query(PackageExtra).filter(
        PackageExtra.key == 'spatial').all()
    packages = [extra.package for extra in spatial_extras]

    errors = []
    count = 0
    for package in packages:
        try:
            value = package.extras['spatial']
            log.debug('Received: %r', value)
            geometry = json.loads(value)
        except (ValueError, TypeError) as e:
            errors.append(u'Package %s - Error decoding JSON object: %s' %
                          (package.id, six.text_type(e)))
            # Do not save: at this point ``geometry`` is either unbound or
            # still holds the value decoded for the previous package.
            continue

        count += 1
        save_package_extent(package.id, geometry)

    Session.commit()

    if errors:
        msg = 'Errors were found:\n%s' % '\n'.join(errors)
        print(msg)

    msg = "Done. Extents generated for %i out of %i packages" % (count,
                                                                 len(packages))

    print(msg)
Esempio n. 2
0
    def _execute_sql(cls, script):
        """Execute ``script`` on the database at ``cls.sqlalchemy_url``.

        The shared ``Session`` is re-bound to a newly created engine, the
        script is executed on the session's connection and the session is
        committed.

        :param script: SQL passed as-is to ``connection.execute``
        """
        Session.bind = create_engine(cls.sqlalchemy_url)
        Session.connection().execute(script)
        Session.commit()
def _execute_script(script_path):
    """Run the SQL file at ``script_path`` statement by statement and commit.

    The file content is split on ``;``. From each piece, ``--`` comments (up
    to the end of their line), newlines and tabs are removed; every piece
    that is still non-empty is executed on the session's connection.

    :param script_path: path of the SQL script to execute
    """
    conn = Session.connection()
    # Read through a context manager so the file handle is always closed.
    with open(script_path, 'r') as script_file:
        script = script_file.read()
    for cmd in script.split(';'):
        cmd = re.sub(r'--(.*)|[\n\t]', '', cmd)
        if cmd:
            conn.execute(cmd)

    Session.commit()
Esempio n. 4
0
def _execute_script(script_path):
    """Run the SQL file at ``script_path`` statement by statement and commit.

    The file content is split on ``;``. From each piece, ``--`` comments (up
    to the end of their line), newlines and tabs are removed; every piece
    that is still non-empty is executed on the session's connection.

    :param script_path: path of the SQL script to execute
    """
    conn = Session.connection()
    # Read through a context manager so the file handle is always closed.
    with open(script_path, "r") as script_file:
        script = script_file.read()
    for cmd in script.split(";"):
        cmd = re.sub(r"--(.*)|[\n\t]", "", cmd)
        if cmd:
            conn.execute(cmd)

    Session.commit()
def setup_postgis_tables():
    """Execute ``scripts/postgis.sql`` (located next to this module) and commit.

    The script is split on ``;``. From each piece, ``--`` comments (up to the
    end of their line), newlines and tabs are removed; every piece that is
    still non-empty is executed on the session's connection.
    """
    conn = Session.connection()
    script_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               'scripts', 'postgis.sql')
    # Read through a context manager so the file handle is always closed.
    with open(script_path, 'r') as script_file:
        script = script_file.read()
    for cmd in script.split(';'):
        cmd = re.sub(r'--(.*)|[\n\t]', '', cmd)
        if cmd:
            conn.execute(cmd)

    Session.commit()
Esempio n. 6
0
def setup(srid=None):
    """Create the ``package_extent`` table and add its geometry column.

    :param srid: spatial reference id used for the ``the_geom`` column; any
        falsy value selects ``DEFAULT_SRID``. It is converted to ``str``
        before being passed as the statement parameter.
    """
    srid = str(srid if srid else DEFAULT_SRID)

    create_table = 'CREATE TABLE package_extent(package_id text PRIMARY KEY)'
    add_geometry = 'SELECT AddGeometryColumn(\'package_extent\',\'the_geom\', %s, \'GEOMETRY\', 2)'

    db = Session.connection()
    db.execute(create_table)
    db.execute(add_geometry, srid)

    Session.commit()
Esempio n. 7
0
def delete_vocabulary(id, cascade=True):
    """
    Delete a vocabulary, by id

    :param id: vocabulary id
    :param cascade: if True, delete all tags in this vocabulary first
    """
    conn = Session.connection()
    with conn.begin():
        if cascade:
            query = delete(tag_table).where(tag_table.c.vocabulary_id == id)
            query.execute()
        query = delete(vocabulary_table).where(vocabulary_table.c.id == id)
        # Building the statement deletes nothing; it has to be executed,
        # in the same way as the tag deletion above.
        query.execute()
 def get_local_datasets_for_portal(self, context, original_portal):
     """Return the names of the active packages harvested from a portal.

     A package is selected when its state is 'active' and it has a
     'metadata_original_portal' extra whose value equals
     ``original_portal``.

     :param context: not used by this method
     :param original_portal: value compared against the extra; it is
         concatenated to a log message, so it must be a string
     :returns: list of package names
     """
     log.info(">>>> Got portal: "+original_portal)
     conn = Session.connection()
     packages = self.table('package')
     extras = self.table('package_extra')

     # Built inside-out: ids of active packages -> ids of those packages
     # carrying the matching portal extra -> names of those packages.
     active_ids = select([packages.c.id]).where(packages.c.state=='active')
     portal_match = and_(extras.c.value==original_portal,
                         extras.c.package_id.in_(active_ids))
     matching_ids = select([extras.c.package_id]).where(
         and_(extras.c.key=='metadata_original_portal', portal_match))
     names_query = select([packages.c.name]).where(
         packages.c.id.in_(matching_ids))

     rows = model.Session.execute(names_query).fetchall()
     results = []
     for row in rows:
         results.append(row['name'])
     log.info('Found %d Datasets for Portal'  %len(results))
     return results
def _execute_script(script_path):
    '''Run the SQL file at ``script_path`` statement by statement and commit.

    The file content is split on ``;``. From each piece, ``--`` comments (up
    to the end of their line), newlines and tabs are removed; every piece
    that is still non-empty is executed on the session's connection.

    :param script_path: path of the SQL script to execute

    '''

    conn = Session.connection()
    # Read through a context manager so the file handle is always closed.
    with open(script_path, u'r') as script_file:
        script = script_file.read()
    for cmd in script.split(u';'):
        cmd = re.sub(r'--(.*)|[\n\t]', u'', cmd)
        if cmd:
            conn.execute(cmd)

    Session.commit()
Esempio n. 10
0
    def update_extents(self):
        """Decode the 'spatial' extra of every package that has one.

        Successful decodes are counted in ``count``; decode failures are
        collected as messages in ``errors``.

        NOTE(review): the visible body neither stores the decoded geometry
        nor reports ``count``/``errors`` -- presumably the remainder of the
        method is missing here; confirm against the full file.
        """
        from ckan.model import PackageExtra, Package, Session
        conn = Session.connection()
        packages = [extra.package \
                    for extra in \
                    Session.query(PackageExtra).filter(PackageExtra.key == 'spatial').all()]

        errors = []
        count = 0
        for package in packages:
            try:
                value = package.extras['spatial']
                log.debug('Received: %r' % value)
                geometry = json.loads(value)

                count += 1
            # ``except X as e`` is valid on Python 2.6+ and Python 3, whereas
            # the former ``except X,e`` form is a SyntaxError on Python 3.
            except (ValueError, TypeError) as e:
                errors.append(u'Package %s - Error decoding JSON object: %s' % (package.id,str(e)))
Esempio n. 11
0
    def update_extents(self):
        """Decode the 'spatial' extra of every package that has one.

        Successful decodes are counted in ``count``; decode failures are
        collected as messages in ``errors``.

        NOTE(review): the visible body neither stores the decoded geometry
        nor reports ``count``/``errors`` -- presumably the remainder of the
        method is missing here; confirm against the full file.
        """
        from ckan.model import PackageExtra, Package, Session
        conn = Session.connection()
        packages = [extra.package \
                    for extra in \
                    Session.query(PackageExtra).filter(PackageExtra.key == 'spatial').all()]

        errors = []
        count = 0
        for package in packages:
            try:
                value = package.extras['spatial']
                log.debug('Received: %r' % value)
                geometry = json.loads(value)

                count += 1
            # ``except X as e`` is valid on Python 2.6+ and Python 3, whereas
            # the former ``except X, e`` form is a SyntaxError on Python 3.
            except (ValueError, TypeError) as e:
                errors.append(u'Package %s - Error decoding JSON object: %s' %
                              (package.id, str(e)))
Esempio n. 12
0
    def _create_or_update_package(self,
                                  package_dict,
                                  harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided.

        The package dictionary can be in one of two forms:

        1. 'rest' - as seen on the RESTful API:

                http://datahub.io/api/rest/dataset/1996_population_census_data_canada

           This is the legacy form. It is the default to provide backward
           compatibility.

           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']

        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the dataset to create or update; modified in
                  place (its 'name', and its 'tags' when the 'clean_tags'
                  config option is set)
        :param harvest_object: the harvest object being imported; it is
                  flagged as current and linked to the package on success
        :param package_dict_form: 'rest' (default) or 'package_show';
                  selects which create/update action is called
        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or False if there were errors. When a validation
                  error is caught here, it is saved against the harvest
                  object and no explicit value is returned (i.e. None).


        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                tags = package_dict.get('tags', [])
                # Munge every tag once, drop those that munge to an empty
                # string, then de-duplicate.
                munged_tags = (munge_tag(t) for t in tags)
                tags = [t for t in munged_tags if t != '']
                tags = list(set(tags))
                package_dict['tags'] = tags

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(
                    package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info(
                        'Package with GUID %s exists and needs to be updated' %
                        harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    package_dict.setdefault('name',
                                            existing_package_dict['name'])

                    new_package = p.toolkit.get_action(
                        'package_update' if package_dict_form ==
                        'package_show' else 'package_update_rest')(
                            context, package_dict)

                else:
                    log.info(
                        'No changes to package with GUID %s, skipping...' %
                        harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(
                        package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(
                        package_dict['title'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it' %
                    harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute(
                    'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = p.toolkit.get_action(
                    'package_create' if package_dict_form ==
                    'package_show' else 'package_create_rest')(context,
                                                               package_dict)

            Session.commit()

            return True

        # ``except X as e`` is valid on Python 2.6+ and Python 3; the former
        # ``except X, e`` form is a SyntaxError on Python 3.
        except p.toolkit.ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
Esempio n. 13
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the dataset to create or update; modified in
                  place (its 'name', and its 'tags' when the 'clean_tags'
                  config option is set)
        :param harvest_object: the harvest object being imported; it is
                  flagged as current and linked to the package on success
        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or False if there were errors. When a validation
                  error is caught here, it is saved against the harvest
                  object and no explicit value is returned (i.e. None).


        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                tags = package_dict.get('tags', [])
                # Munge every tag once, drop those that munge to an empty
                # string, then de-duplicate.
                munged_tags = (munge_tag(t) for t in tags)
                tags = [t for t in munged_tags if t != '']
                tags = list(set(tags))
                package_dict['tags'] = tags

            # Check if package exists
            try:
                existing_package_dict = self._find_existing_package(package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id':package_dict['id']})
                    package_dict.setdefault('name',
                            existing_package_dict['name'])
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        # ``except X as e`` is valid on Python 2.6+ and Python 3; the former
        # ``except X,e`` form is a SyntaxError on Python 3.
        except ValidationError as e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
Esempio n. 14
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the dataset to create or update; modified in
                  place (its 'name', and its 'tags' when the 'clean_tags'
                  config option is set)
        :param harvest_object: the harvest object being imported; it is
                  flagged as current and linked to the package on success
        :returns: True if the package was created or updated. None if the
                  existing package did not need updating, or if a validation
                  error was caught (the error is then saved against the
                  harvest object).


        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts)
        '''
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                tags = package_dict.get('tags', [])
                # Munge every tag once, drop those that munge to an empty
                # string, then de-duplicate.
                munged_tags = (munge_tag(t) for t in tags)
                tags = [t for t in munged_tags if t != '']
                tags = list(set(tags))
                package_dict['tags'] = tags

            # Check if package exists
            data_dict = {}
            data_dict['id'] = package_dict['id']
            try:
                existing_package_dict = get_action('package_show')(context, data_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id':package_dict['id']})
                    package_dict.setdefault('name',
                            existing_package_dict['name'])
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    return

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        # ``except X as e`` is valid on Python 2.6+ and Python 3; the former
        # ``except X,e`` form is a SyntaxError on Python 3.
        except ValidationError as e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
Esempio n. 15
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the dataset to create or update; its 'tags' are
                  replaced in place by their munged, de-duplicated form
        :param harvest_object: the harvest object being imported; it is
                  flagged as current and linked to the package on success
        :returns: True if the package was created or updated. None if the
                  existing package did not need updating, or if a validation
                  error was caught (the error is then saved against the
                  harvest object).

        '''
        try:
            # Change default schema
            schema = default_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                api_version = self.config.get('api_version','2')
                #TODO: use site user when available
                user_name = self.config.get('user',u'harvest')
            else:
                api_version = '2'
                user_name = u'harvest'

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
            }

            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags]
            tags = list(set(tags))
            package_dict['tags'] = tags

            # Check if package exists
            data_dict = {}
            data_dict['id'] = package_dict['id']
            try:
                existing_package_dict = get_action('package_show')(context, data_dict)
                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id':package_dict['id']})
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    return

            except NotFound:
                # Package needs to be created

                # Check if name has not already been used
                package_dict['name'] = self._check_name(package_dict['name'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                new_package = get_action('package_create_rest')(context, package_dict)
                harvest_object.package_id = new_package['id']

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                    .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                    .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])
            Session.commit()

            # Flag this as the current harvest object

            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

            return True

        # ``except X as e`` is valid on Python 2.6+ and Python 3; the former
        # ``except X,e`` form is a SyntaxError on Python 3.
        except ValidationError as e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
Esempio n. 16
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the dataset to create or update; its 'tags' are
                  replaced in place by their munged, de-duplicated form
        :param harvest_object: the harvest object being imported; it is
                  flagged as current and linked to the package on success
        :returns: True if the package was created or updated. None if the
                  existing package did not need updating, or if a validation
                  error was caught (the error is then saved against the
                  harvest object).

        '''
        try:
            # Change default schema
            schema = default_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                api_version = self.config.get('api_version', '2')
                #TODO: use site user when available
                user_name = self.config.get('user', u'harvest')
            else:
                api_version = '2'
                user_name = u'harvest'

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
            }

            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags]
            tags = list(set(tags))
            package_dict['tags'] = tags

            # Check if package exists
            data_dict = {}
            data_dict['id'] = package_dict['id']
            try:
                existing_package_dict = get_action('package_show')(context,
                                                                   data_dict)
                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info(
                        'Package with GUID %s exists and needs to be updated' %
                        harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    new_package = get_action('package_update_rest')(
                        context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' %
                             harvest_object.guid)
                    return

            except NotFound:
                # Package needs to be created

                # Check if name has not already been used
                package_dict['name'] = self._check_name(package_dict['name'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it' %
                    harvest_object.guid)
                new_package = get_action('package_create_rest')(context,
                                                                package_dict)
                harvest_object.package_id = new_package['id']

            # Flag the other objects linking to this package as not current anymore
            from ckanext.harvest.model import harvest_object_table
            conn = Session.connection()
            u = update(harvest_object_table) \
                    .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                    .values(current=False)
            conn.execute(u, b_package_id=new_package['id'])
            Session.commit()

            # Flag this as the current harvest object

            harvest_object.package_id = new_package['id']
            harvest_object.current = True
            harvest_object.save()

            return True

        # ``except X as e`` is valid on Python 2.6+ and Python 3; the former
        # ``except X, e`` form is a SyntaxError on Python 3.
        except ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
Esempio n. 17
0
    def _create_or_update_package(self,
                                  package_dict,
                                  harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided.

        The package dictionary can be in one of two forms:

        1. 'rest' - as seen on the RESTful API:
                http://datahub.io/api/rest/dataset/1996_population_census_data_canada
           This is the legacy form. It is the default to provide backward
           compatibility.
           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']
        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):
               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada
           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        Behaviour specific to this harvester:

        * package_dict_form only selects the action used when *creating*
          ('package_create' vs 'package_create_rest'); updates always go
          through 'package_patch'.
        * On update, keywords are merged with the existing ones, blank
          maintainer fields are dropped so they do not overwrite stored
          values, and resources are synchronised one by one
          (resource_patch / resource_create / resource_delete) and then
          removed from package_dict before patching the package.
        * Two datasets (matched by a hard-coded name and a hard-coded id)
          are always updated, whatever their modification date.

        :param package_dict: the remote dataset; it is modified in place.
        :param harvest_object: the harvest object being imported; its
                  ``current`` and ``package_id`` attributes are set here.
        :param package_dict_form: 'rest' or 'package_show'.
        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or None if there were errors (the error is logged
                  and saved against the harvest object).

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, six.text_type]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                tags = package_dict.get('tags', [])
                package_dict['tags'] = self._clean_tags(tags)

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(
                    package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date. The last two clauses force an update
                # of two specific datasets regardless of their dates.
                needs_update = (
                    'metadata_modified' not in package_dict
                    or package_dict['metadata_modified'] >
                    existing_package_dict.get('metadata_modified')
                    or package_dict['name'] ==
                    "status-of-covid-19-cases-in-ontario-by-public-health-unit-phu"
                    or package_dict['id'] ==
                    'ecb75ea0-8b72-4f46-a14a-9bd54841d6ab')

                if needs_update:
                    log.info(
                        'Package with GUID %s exists and needs to be updated' %
                        harvest_object.guid)
                    # Update package
                    context.update({'id': package_dict['id']})
                    package_dict.setdefault('name',
                                            existing_package_dict['name'])

                    # What we want to do here is:
                    #   - not overwrite maintainer name, maintainer email or
                    #     maintainer branch with blank information
                    #   - not include resources because it will overwrite
                    #     the existing resources
                    #   - match owner_org
                    #   - not overwrite all keywords (just add)

                    package_dict['keywords'] = {
                        "en":
                        list(
                            set(existing_package_dict['keywords']['en'] +
                                package_dict['keywords']['en'])),
                        "fr":
                        list(
                            set(existing_package_dict['keywords']['fr'] +
                                package_dict['keywords']['fr']))
                    }
                    package_dict['owner_org'] = package_dict['organization'][
                        'name']
                    package_dict['harvester'] = "ontario-data-catalogue"

                    # pop() rather than del: the key may be absent altogether,
                    # in which case .get() also yields "".
                    if package_dict.get("maintainer_email", "") == "":
                        package_dict.pop('maintainer_email', None)

                    if "maintainer_translated" in package_dict:
                        maintainer = package_dict['maintainer_translated']
                        en_blank = maintainer.get("en", "") == ""
                        fr_blank = maintainer.get("fr", "") == ""
                        if en_blank and fr_blank:
                            del package_dict['maintainer_translated']
                        elif fr_blank:
                            # Only English supplied: reuse it for French
                            maintainer['fr'] = maintainer['en']
                        elif en_blank:
                            # Only French supplied: reuse it for English
                            maintainer['en'] = maintainer['fr']

                    if "maintainer_branch" in package_dict:
                        branch = package_dict['maintainer_branch']
                        if branch.get("en", "") == "" and \
                           branch.get("fr", "") == "":
                            del package_dict['maintainer_branch']

                    if 'resources' in package_dict:
                        existing_resources = existing_package_dict["resources"]
                        # Built once instead of once per remote resource
                        existing_resource_ids = set(
                            res["id"] for res in existing_resources)

                        for resource in package_dict['resources']:
                            resource.update({"harvested_resource": True})
                            resource_context = {
                                'model': model,
                                'session': Session,
                                'user': user_name,
                                'api_version': api_version,
                                'id': resource['id'],
                                'ignore_auth': True,
                            }
                            if resource['id'] in existing_resource_ids:
                                action_name = "resource_patch"
                            else:
                                action_name = "resource_create"
                            p.toolkit.get_action(action_name)(
                                resource_context, resource)

                        remote_resource_ids = set(
                            res["id"] for res in package_dict["resources"])

                        # If there's a harvested resource locally that isn't
                        # in the latest harvested list of resources, delete
                        # it. .get() is used because resources that carry no
                        # "harvested_resource" flag must simply be left alone.
                        stale_resources = [
                            res for res in existing_resources
                            if res.get("harvested_resource") == True  # noqa: E712
                            and res['id'] not in remote_resource_ids
                        ]
                        for resource in stale_resources:
                            resource_context = {
                                'model': model,
                                'session': Session,
                                'user': user_name,
                                'api_version': api_version,
                                'id': resource['id'],
                                'ignore_auth': True,
                            }
                            p.toolkit.get_action("resource_delete")(
                                resource_context, {
                                    'id': resource['id']
                                })

                        # Resources were handled individually above; leaving
                        # them in would make package_patch overwrite them.
                        del package_dict['resources']

                    new_package = p.toolkit.get_action("package_patch")(
                        context, package_dict)

                else:
                    log.info(
                        'No changes to package with GUID %s, skipping...' %
                        harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table)\
                    .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
                    .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            # NOTE(review): this handler covers the whole update path above,
            # so an ObjectNotFound raised by one of the resource/package
            # actions would also land here, not only one raised by
            # _find_existing_package -- confirm that is intended.
            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(
                        package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(
                        package_dict['title'])

                log.info(
                    'Package with GUID %s does not exist, let\'s create it' %
                    harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute(
                    'SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                package_dict['owner_org'] = package_dict['organization'][
                    'name']
                package_dict['harvester'] = "ontario-data-catalogue"
                # A dataset may arrive without a 'resources' key; the update
                # path tolerates that, so the create path does too.
                for resource in package_dict.get('resources', []):
                    resource.update({"harvested_resource": True})

                # Unlike the update path, blank maintainer details are
                # replaced with defaults when creating.
                if package_dict.get("maintainer_email", "") == "":
                    package_dict['maintainer_email'] = "*****@*****.**"
                if "maintainer_translated" in package_dict:
                    maintainer = package_dict['maintainer_translated']
                    en_blank = maintainer.get("en", "") == ""
                    fr_blank = maintainer.get("fr", "") == ""
                    if en_blank and fr_blank:
                        package_dict['maintainer_translated'] = {
                            "en": "Open Data",
                            "fr": "Données ouvertes"
                        }
                    elif fr_blank:
                        maintainer['fr'] = maintainer['en']
                    elif en_blank:
                        maintainer['en'] = maintainer['fr']
                else:
                    package_dict['maintainer_translated'] = {
                        "en": "Open Data",
                        "fr": "Données ouvertes"
                    }
                new_package = p.toolkit.get_action(
                    'package_create' if package_dict_form ==
                    'package_show' else 'package_create_rest')(context,
                                                               package_dict)

            Session.commit()

            return True

        except p.toolkit.ValidationError as e:
            log.exception(e)
            self._save_object_error(
                'Invalid package with GUID %s: %r' %
                (harvest_object.guid, e.error_dict), harvest_object, 'Import')
        except Exception as e:
            # Any other failure is logged and recorded against the harvest
            # object instead of being propagated to the caller.
            log.exception(e)
            self._save_object_error('%r' % e, harvest_object, 'Import')

        return None
Esempio n. 18
0
    def _create_or_update_package(self, package_dict, harvest_object):
        '''
        Creates a new package or updates an existing one according to the
        package dictionary provided. The package dictionary should look like
        the REST API response for a package:

        http://ckan.net/api/rest/package/statistics-catalunya

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :param package_dict: the remote dataset in REST form; its 'tags'
                  (and, on creation, its 'name') are rewritten in place.
        :param harvest_object: the harvest object being imported; its
                  ``current`` and ``package_id`` attributes are set here.
        :returns: True if the package was created or updated. None if the
                  existing package was already up to date, or if validation
                  failed (the error is logged and saved against the harvest
                  object).

        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts
        '''

        log.debug('_create_or_update_package')
        try:
            # Change default schema
            # NOTE(review): `unicode` is a Python 2 builtin; this line needs
            # six.text_type / str before the method can run on Python 3.
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')

                #TODO: use site user when available
                user_name = self.config.get('user', u'harvest')
            else:
                api_version = 2
                user_name = u'harvest'

            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            # Munge every tag and drop the duplicates that munging produces
            tags = package_dict.get('tags', [])
            tags = [munge_tag(t) for t in tags]
            tags = list(set(tags))
            package_dict['tags'] = tags

            # Check if package exists
            data_dict = {}
            data_dict['id'] = package_dict['id']
            try:
                existing_package_dict = get_action('package_show')(context, data_dict)
                # Check modified date
                if 'metadata_modified' not in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id':package_dict['id']})
                    new_package = get_action('package_update_rest')(context, package_dict)

                else:
                    log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
                    return

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except NotFound:
                # Package needs to be created

                # Check if name has not already been used
                package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = get_action('package_create_rest')(context, package_dict)

            Session.commit()

            return True

        # `as` form: valid on Python 2.6+ and required on Python 3
        except ValidationError as e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')
Esempio n. 19
0
    def _create_or_update_package(self, package_dict, harvest_object,
                                  package_dict_form='rest'):
        '''
        Creates a new package or updates an exisiting one according to the
        package dictionary provided.

        The package dictionary can be in one of two forms:

        1. 'rest' - as seen on the RESTful API:

                http://datahub.io/api/rest/dataset/1996_population_census_data_canada

           This is the legacy form. It is the default to provide backward
           compatibility.

           * 'extras' is a dict e.g. {'theme': 'health', 'sub-theme': 'cancer'}
           * 'tags' is a list of strings e.g. ['large-river', 'flood']

        2. 'package_show' form, as provided by the Action API (CKAN v2.0+):

               http://datahub.io/api/action/package_show?id=1996_population_census_data_canada

           * 'extras' is a list of dicts
                e.g. [{'key': 'theme', 'value': 'health'},
                        {'key': 'sub-theme', 'value': 'cancer'}]
           * 'tags' is a list of dicts
                e.g. [{'name': 'large-river'}, {'name': 'flood'}]

        Note that the package_dict must contain an id, which will be used to
        check if the package needs to be created or updated (use the remote
        dataset id).

        If the remote server provides the modification date of the remote
        package, add it to package_dict['metadata_modified'].

        :returns: The same as what import_stage should return. i.e. True if the
                  create or update occurred ok, 'unchanged' if it didn't need
                  updating or False if there were errors.


        TODO: Not sure it is worth keeping this function. If useful it should
        use the output of package_show logic function (maybe keeping support
        for rest api based dicts
        '''
        assert package_dict_form in ('rest', 'package_show')
        try:
            # Change default schema
            schema = default_create_package_schema()
            schema['id'] = [ignore_missing, unicode]
            schema['__junk'] = [ignore]

            # Check API version
            if self.config:
                try:
                    api_version = int(self.config.get('api_version', 2))
                except ValueError:
                    raise ValueError('api_version must be an integer')
            else:
                api_version = 2

            user_name = self._get_user_name()
            context = {
                'model': model,
                'session': Session,
                'user': user_name,
                'api_version': api_version,
                'schema': schema,
                'ignore_auth': True,
            }

            if self.config and self.config.get('clean_tags', False):
                tags = package_dict.get('tags', [])
                tags = [munge_tag(t) for t in tags if munge_tag(t) != '']
                tags = list(set(tags))
                package_dict['tags'] = tags

            # Check if package exists
            try:
                # _find_existing_package can be overridden if necessary
                existing_package_dict = self._find_existing_package(package_dict)

                # In case name has been modified when first importing. See issue #101.
                package_dict['name'] = existing_package_dict['name']

                # Check modified date
                if not 'metadata_modified' in package_dict or \
                   package_dict['metadata_modified'] > existing_package_dict.get('metadata_modified'):
                    log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid)
                    # Update package
                    context.update({'id':package_dict['id']})
                    package_dict.setdefault('name',
                                            existing_package_dict['name'])

                    new_package = p.toolkit.get_action(
                        'package_update' if package_dict_form == 'package_show'
                        else 'package_update_rest')(context, package_dict)

                else:
                    log.info('No changes to package with GUID %s, skipping...' % harvest_object.guid)
                    # NB harvest_object.current/package_id are not set
                    return 'unchanged'

                # Flag the other objects linking to this package as not current anymore
                from ckanext.harvest.model import harvest_object_table
                conn = Session.connection()
                u = update(harvest_object_table) \
                        .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \
                        .values(current=False)
                conn.execute(u, b_package_id=new_package['id'])

                # Flag this as the current harvest object

                harvest_object.package_id = new_package['id']
                harvest_object.current = True
                harvest_object.save()

            except p.toolkit.ObjectNotFound:
                # Package needs to be created

                # Get rid of auth audit on the context otherwise we'll get an
                # exception
                context.pop('__auth_audit', None)

                # Set name for new package to prevent name conflict, see issue #117
                if package_dict.get('name', None):
                    package_dict['name'] = self._gen_new_name(package_dict['name'])
                else:
                    package_dict['name'] = self._gen_new_name(package_dict['title'])

                log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid)
                harvest_object.current = True
                harvest_object.package_id = package_dict['id']
                # Defer constraints and flush so the dataset can be indexed with
                # the harvest object id (on the after_show hook from the harvester
                # plugin)
                harvest_object.add()

                model.Session.execute('SET CONSTRAINTS harvest_object_package_id_fkey DEFERRED')
                model.Session.flush()

                new_package = p.toolkit.get_action(
                    'package_create' if package_dict_form == 'package_show'
                    else 'package_create_rest')(context, package_dict)

            Session.commit()

            return True

        except p.toolkit.ValidationError, e:
            log.exception(e)
            self._save_object_error('Invalid package with GUID %s: %r'%(harvest_object.guid,e.error_dict),harvest_object,'Import')