def _insert(batch):
    # Insert a batch of records into the Mongo collection, falling back to a
    # bulk upsert when the batch contains records that already exist.
    # NOTE(review): `self`, `log` and `DuplicateKeyError` are free variables
    # here - this is presumably a closure defined inside a method; confirm
    # against the enclosing scope.
    try:
        self.collection.insert(batch)
    except DuplicateKeyError:
        # Duplicate key error - KE export does duplicate some records
        # So switch to bulk upsert for this operation
        log.error('Duplicate key error - switching to upsert')
        # initialize_unordered_bulk_op is a legacy pymongo bulk API; unordered
        # so one failing upsert does not stop the rest of the batch
        bulk = self.collection.initialize_unordered_bulk_op()
        for batch_record in batch:
            # Upsert keyed on _id: replace the existing document wholesale
            bulk.find({'_id': batch_record['_id']}).upsert().replace_one(batch_record)
        bulk.execute()
def _insert(batch):
    # Try a straight bulk insert first. The KE export occasionally repeats
    # records, so if Mongo reports a duplicate key, retry the whole batch as
    # unordered upserts keyed on _id instead.
    try:
        self.collection.insert(batch)
    except DuplicateKeyError:
        log.error('Duplicate key error - switching to upsert')
        bulk_op = self.collection.initialize_unordered_bulk_op()
        for record in batch:
            bulk_op.find({'_id': record['_id']}).upsert().replace_one(record)
        bulk_op.execute()
def ckan_delete(remote_ckan, mongo_record):
    """Delete the CKAN datastore row that corresponds to *mongo_record*.

    Picks the dataset task class matching the record's ColRecordType,
    derives the primary key field from that task's column definitions, then
    issues a datastore_delete against the package's first resource.
    Failures (missing primary key value, unknown resource, record not
    found in CKAN) are logged rather than raised.
    """
    # To avoid circular imports, import the tasks we need to check here
    # Dataset tasks are dependent on the DeleteTask
    from ke2mongo.tasks.indexlot import IndexLotDatasetAPITask
    from ke2mongo.tasks.artefact import ArtefactDatasetAPITask
    from ke2mongo.tasks.specimen import SpecimenDatasetAPITask

    # By default, use SpecimenDatasetAPITask
    task_cls = SpecimenDatasetAPITask

    # Override default class if is Index Lot or Artefact
    for t in [IndexLotDatasetAPITask, ArtefactDatasetAPITask]:
        if t.record_type == mongo_record['ColRecordType']:
            task_cls = t
            break

    # Get the primary key
    # NOTE(review): if no column matches, primary_key_field is unbound and the
    # line below raises NameError - presumably the task classes guarantee a match
    for col in task_cls.columns:
        if col[1] == task_cls.datastore['primary_key']:
            primary_key_field = col
            break

    # Get the source primary key - this needs to be split on . as we have added the collection name
    ke_primary_key = primary_key_field[0].split('.')[1]
    # The name of the primary key field used in CKAN
    ckan_primary_key = primary_key_field[1]

    try:
        primary_key_value = mongo_record[ke_primary_key]
    except KeyError:
        log.error('No value for primary key %s', ke_primary_key)
    else:
        resource_id = get_resource_id(remote_ckan, task_cls.package['name'])
        if resource_id:
            try:
                # And delete the record from the datastore
                log.info('Deleting record from CKAN where %s=%s' % (ckan_primary_key, primary_key_value))
                remote_ckan.action.datastore_delete(
                    id=resource_id, filters={ckan_primary_key: primary_key_value}, force=True)
            except ckanapi.CKANAPIError:
                # We don't care if the record isn't found
                log.error('Record not found')
        else:
            log.error('No resource ID')
def ckan_delete(remote_ckan, mongo_record):
    """Remove the datastore row for *mongo_record* from CKAN.

    All failure modes are logged and swallowed; the function never raises.
    """
    # Deferred imports: the dataset task modules depend on this module, so a
    # top-level import would be circular.
    from ke2mongo.tasks.indexlot import IndexLotDatasetAPITask
    from ke2mongo.tasks.artefact import ArtefactDatasetAPITask
    from ke2mongo.tasks.specimen import SpecimenDatasetAPITask

    # Specimen is the default dataset; switch when the record type says
    # the record is an index lot or artefact instead.
    task_cls = SpecimenDatasetAPITask
    for candidate in (IndexLotDatasetAPITask, ArtefactDatasetAPITask):
        if candidate.record_type == mongo_record['ColRecordType']:
            task_cls = candidate
            break

    # Find the column tuple describing the datastore primary key
    for column in task_cls.columns:
        if column[1] == task_cls.datastore['primary_key']:
            primary_key_field = column
            break

    # The KE field name carries a "collection." prefix - strip it off
    ke_primary_key = primary_key_field[0].split('.')[1]
    # Field name as CKAN knows it
    ckan_primary_key = primary_key_field[1]

    try:
        primary_key_value = mongo_record[ke_primary_key]
    except KeyError:
        log.error('No value for primary key %s', ke_primary_key)
        return

    resource_id = get_resource_id(remote_ckan, task_cls.package['name'])
    if not resource_id:
        log.error('No resource ID')
        return

    try:
        log.info('Deleting record from CKAN where %s=%s' % (ckan_primary_key, primary_key_value))
        remote_ckan.action.datastore_delete(
            id=resource_id,
            filters={ckan_primary_key: primary_key_value},
            force=True)
    except ckanapi.CKANAPIError:
        # Record already absent from CKAN - nothing to do
        log.error('Record not found')
def get_resource_id(remote_ckan, package_name):
    """Return the ID of the first resource of a CKAN package, with caching.

    Results are memoised in the module-level _cache dict. Returns None (and
    logs) when the package does not exist; re-raises any other CKAN API error.

    Fixes: the original used the Python-2-only `except X, e` syntax and left
    bare debug `print e` statements in; the exception detail is now folded
    into the log calls and the `as` syntax (valid on Python 2.6+ and 3) is
    used instead.
    """
    try:
        # Try and retrieve from cache
        return _cache[package_name]
    except KeyError:
        log.error('Not cached %s', package_name)
        # Load the package, so we can find the resource ID
        try:
            ckan_package = remote_ckan.action.package_show(id=package_name)
            _cache[package_name] = ckan_package['resources'][0]['id']
            return _cache[package_name]
        except ckanapi.NotFound as e:
            # Unknown package: log and fall through, returning None
            log.error('CKAN Package %s not found: %s', package_name, e)
        except ckanapi.CKANAPIError as e:
            # Anything else is unexpected - log and propagate
            log.error('CKAN API ERROR: %s', e)
            raise
def process_record(self, data):
    """Validate and normalise a KE EMu record before it is written to Mongo.

    Raises InvalidRecordException when the record should be skipped: excluded
    ColRecordType, invalid AdmGUIDPreferredValue, missing ColDepartment, or a
    malformed AdmDateInserted. Otherwise mutates *data* in place (string-casts
    dynamicProperties fields, sets the 'cites' flag, computes
    'RealEmbargoDate') and delegates to the parent class.

    Fixes: the original used a Python-2-only `print 'ERROR: ', guid`
    statement (a syntax error on Python 3) and carried commented-out dead
    code; the skip is now recorded through the logger and the dead code is
    removed.
    """
    # Only import if it's one of the record types we want
    record_type = data.get('ColRecordType', 'Missing')
    if record_type in self.excluded_types:
        log.debug('Skipping record %s: Excluded type %s', data['irn'], record_type)
        raise InvalidRecordException

    # Make sure the UUID is valid (a version-4 UUID string)
    guid = data.get('AdmGUIDPreferredValue', None)
    if guid:
        try:
            UUID(guid, version=4)
        except ValueError:
            # Not a valid hex string for a UUID - skip the record
            log.error('Skipping record %s: invalid GUID %s', data['irn'], guid)
            raise InvalidRecordException

    # If we don't have collection department, skip it
    if not data.get('ColDepartment', None):
        raise InvalidRecordException

    date_inserted = data.get('AdmDateInserted', None)
    # Some records have an invalid AdmDateInserted=20-09-27
    # As we need this for the stats, we need to skip them - just checking
    # against date length as it's much quicker than parsing
    if not date_inserted or len(DATE_FORMAT) != len(date_inserted):
        log.error('Skipping record %s: invalid AdmDateInserted %s', data['irn'], date_inserted)
        raise InvalidRecordException

    # For now, the mongo aggregator cannot handle int / bool in $concat
    # So properties that are used in dynamicProperties need to be cast as strings
    for field in ('DnaTotalVolume', 'FeaCultivated', 'MinMetRecoveryWeight', 'MinMetWeightAsRegistered'):
        if field in data:
            data[field] = str(data[field])

    # If record is a CITES species, mark cites = True
    scientific_name = data.get('DarScientificName', None)
    if scientific_name and scientific_name in self.cites_species:
        data['cites'] = True

    # For the embargo date, we're going to use the latest of NhmSecEmbargoDate
    # and NhmSecEmbargoExtensionDate - convert each to a timestamp, 0 if unset
    embargo_list = []
    for f in ['NhmSecEmbargoDate', 'NhmSecEmbargoExtensionDate']:
        if data.get(f):
            ts = self.date_to_timestamp(data.get(f))
        else:
            ts = 0
        embargo_list.append(ts)

    # Set the Real Embargo date to the largest embargo or extension date
    data['RealEmbargoDate'] = max(embargo_list)

    return super(MongoCatalogueTask, self).process_record(data)