Example #1
        def _insert(batch):

            try:
                self.collection.insert(batch)
            except DuplicateKeyError:
                # The KE EMu export can contain duplicate records, so fall
                # back to a bulk upsert for this batch

                log.error('Duplicate key error - switching to upsert')

                bulk = self.collection.initialize_unordered_bulk_op()
                for batch_record in batch:
                    bulk.find({'_id': batch_record['_id']}).upsert().replace_one(batch_record)

                bulk.execute()
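The legacy bulk API used above (`initialize_unordered_bulk_op`) was removed in PyMongo 4, along with `Collection.insert`. A rough modern equivalent of the same insert-then-upsert fallback, as a sketch assuming `collection` is a PyMongo collection and `batch` is a list of dicts keyed on `_id`:

from pymongo import ReplaceOne
from pymongo.errors import BulkWriteError

def _insert(collection, batch):
    try:
        # insert_many raises BulkWriteError (rather than DuplicateKeyError)
        # when any document in the batch collides on _id
        collection.insert_many(batch, ordered=False)
    except BulkWriteError:
        # Fall back to an unordered bulk upsert, replacing each record by _id
        collection.bulk_write(
            [ReplaceOne({'_id': r['_id']}, r, upsert=True) for r in batch],
            ordered=False,
        )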
Example #2
def ckan_delete(remote_ckan, mongo_record):

    # To avoid circular imports, import the tasks we need to check here
    # Dataset tasks are dependent on the DeleteTask
    from ke2mongo.tasks.indexlot import IndexLotDatasetAPITask
    from ke2mongo.tasks.artefact import ArtefactDatasetAPITask
    from ke2mongo.tasks.specimen import SpecimenDatasetAPITask

    # By default, use SpecimenDatasetAPITask
    task_cls = SpecimenDatasetAPITask

    # Override the default class if the record is an Index Lot or Artefact
    for t in [IndexLotDatasetAPITask, ArtefactDatasetAPITask]:
        if t.record_type == mongo_record['ColRecordType']:
            task_cls = t
            break

    # Get the primary key
    for col in task_cls.columns:
        if col[1] == task_cls.datastore['primary_key']:
            primary_key_field = col
            break

    # Get the source primary key - split on '.' as the collection name
    # has been prepended to the field name
    ke_primary_key = primary_key_field[0].split('.')[1]

    # The name of the primary key field used in CKAN
    ckan_primary_key = primary_key_field[1]

    try:
        primary_key_value = mongo_record[ke_primary_key]
    except KeyError:
        log.error('No value for primary key %s', ke_primary_key)
    else:
        resource_id = get_resource_id(remote_ckan, task_cls.package['name'])
        if resource_id:
            try:
                # And delete the record from the datastore
                log.info('Deleting record from CKAN where %s=%s',
                         ckan_primary_key, primary_key_value)
                remote_ckan.action.datastore_delete(
                    id=resource_id,
                    filters={ckan_primary_key: primary_key_value},
                    force=True)
            except ckanapi.CKANAPIError:
                # A missing record is not fatal - log it and carry on
                log.error('Record not found')
        else:
            log.error('No resource ID')
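The primary-key lookup in `ckan_delete` indexes `col[0]` and `col[1]`, so it only works if `task_cls.columns` is a sequence of tuples whose first element is the `collection.field` source name and whose second is the CKAN field name. A hypothetical illustration of the shapes the function assumes (the real ke2mongo task definitions may differ):

# Hypothetical task attributes - illustrative values, not the real ke2mongo ones
class SpecimenDatasetAPITask(object):
    # (source column, CKAN field, type); col[0] carries the mongo collection
    # name, hence the split('.') above to recover the KE field name
    columns = [
        ('ecatalogue.irn', '_id', 'int32'),
        ('ecatalogue.AdmGUIDPreferredValue', 'occurrenceID', 'string'),
    ]
    datastore = {'primary_key': '_id'}
    package = {'name': 'specimen-collection'}

With these values the lookup selects `('ecatalogue.irn', '_id', 'int32')`, so `ke_primary_key` becomes `irn` and `ckan_primary_key` becomes `_id`.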
Example #3
def get_resource_id(remote_ckan, package_name):

    try:
        # Try to retrieve the resource ID from the cache
        return _cache[package_name]
    except KeyError:
        log.error('Not cached %s', package_name)
        # Load the package so we can find the resource ID
        try:
            ckan_package = remote_ckan.action.package_show(id=package_name)
            _cache[package_name] = ckan_package['resources'][0]['id']
            return _cache[package_name]
        except ckanapi.NotFound as e:
            log.error('CKAN package %s not found: %s', package_name, e)
        except ckanapi.CKANAPIError as e:
            log.error('CKAN API error: %s', e)
            raise
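`get_resource_id` relies on a module-level `_cache` dict (not shown in the snippet) and implicitly returns `None` when the package cannot be loaded, which is what the `if resource_id:` guard in `ckan_delete` tests. A minimal usage sketch, assuming the cache is a plain dict and using a hypothetical CKAN instance URL and package name:

import ckanapi

# Module-level cache assumed by get_resource_id()
_cache = {}

remote_ckan = ckanapi.RemoteCKAN('https://ckan.example.org', apikey='...')
resource_id = get_resource_id(remote_ckan, 'specimen-collection')
if resource_id:
    print('First resource: %s' % resource_id)

Note that only the first resource of the package is cached, so the function assumes each CKAN package holds a single datastore resource.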
Example #4
    def process_record(self, data):

        # Only import the record if it's one of the types we want
        record_type = data.get('ColRecordType', 'Missing')

        if record_type in self.excluded_types:
            log.debug('Skipping record %s: Excluded type %s', data['irn'],
                      record_type)
            raise InvalidRecordException

        # Make sure the UUID is valid

        guid = data.get('AdmGUIDPreferredValue', None)

        if guid:

            try:
                UUID(guid, version=4)
            except ValueError:
                # Not a valid hex string for a version 4 UUID - skip the record
                log.error('Skipping record %s: invalid GUID %s', data['irn'], guid)
                raise InvalidRecordException

        # If we don't have a collection department, skip the record
        if not data.get('ColDepartment', None):
            raise InvalidRecordException

        date_inserted = data.get('AdmDateInserted', None)

        # Some records have an invalid AdmDateInserted (e.g. 20-09-27). As we
        # need this field for the stats, skip them - comparing the length
        # against the date format is much quicker than parsing the date
        if not date_inserted or len(DATE_FORMAT) != len(date_inserted):
            log.error('Skipping record %s: invalid AdmDateInserted %s',
                      data['irn'], date_inserted)
            raise InvalidRecordException

        # For now, the mongo aggregator cannot handle int / bool in $concat
        # So properties that are used in dynamicProperties need to be cast as strings
        for i in [
                'DnaTotalVolume', 'FeaCultivated', 'MinMetRecoveryWeight',
                'MinMetWeightAsRegistered'
        ]:
            if i in data:
                data[i] = str(data[i])

        # If record is a CITES species, mark cites = True
        scientific_name = data.get('DarScientificName', None)

        if scientific_name and scientific_name in self.cites_species:
            data['cites'] = True

        # For the embargo date, use the latest of NhmSecEmbargoDate and
        # NhmSecEmbargoExtensionDate, so loop through both and convert
        # each to a timestamp

        embargo_list = []

        for f in ['NhmSecEmbargoDate', 'NhmSecEmbargoExtensionDate']:
            if data.get(f):
                ts = self.date_to_timestamp(data.get(f))
            else:
                ts = 0
            embargo_list.append(ts)

        # Set the real embargo date to the latest embargo or extension date
        data['RealEmbargoDate'] = max(embargo_list)
        return super(MongoCatalogueTask, self).process_record(data)
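Callers are expected to treat `InvalidRecordException` as a skip signal rather than a failure. A minimal sketch of that pattern, with a hypothetical `task` instance and record iterable:

# Hypothetical driver loop - names are illustrative, not ke2mongo API
for record in records:
    try:
        task.process_record(record)
    except InvalidRecordException:
        # Excluded type, bad GUID, missing department or invalid
        # insert date - skip the record
        continue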