def run(self):
    """Replay eaudit delete records against the matching Mongo collections.

    For every audit record in the KE EMu export, look up the Mongo
    collection registered for the record's module (AudTable) and delete
    the document identified by its IRN (AudKey).

    Raises:
        Exception: if the task is run without the --force flag.
    """
    # Running this task doesn't delete anything from CKAN itself - so
    # require --force flag to be sent to run it
    if not self.force:
        raise Exception('Warning: this class does not delete CKAN records. Use --force to run it.')

    # Build a dict of all modules and collections.
    # We then retrieve the appropriate collection from the record's
    # module name (AudTable).
    # NOTE(review): the original comment said MongoDeleteTask should be
    # excluded, but __subclasses__() includes every MongoTask subclass -
    # confirm MongoDeleteTask is not itself a MongoTask subclass.
    collections = {
        cls.module: cls(None).get_collection()
        for cls in MongoTask.__subclasses__()
    }

    # Close the export file handle when done - the original left it open
    # for the lifetime of the process.
    with self.input().open('r') as export_file:
        ke_data = KEParser(export_file, file_path=self.input().path,
                           schema_file=self.keemu_schema_file)

        for record in self.iterate_data(ke_data):
            module = record.get('AudTable')
            irn = record.get('AudKey')
            try:
                collection = collections[module]
            except KeyError:
                log.debug('Skipping eaudit record for %s' % module)
                # We do not have a collection for this module - skip to next record
                continue
            else:
                log.info('Deleting record %s(%s)' % (module, irn))
                self.delete(collection, irn)

    self.mark_complete()
def run(self):
    """Process eaudit delete records, removing each referenced document
    from the Mongo collection that matches its module name.

    Refuses to run unless --force was supplied, since deletions here do
    not propagate to CKAN.
    """
    if not self.force:
        raise Exception(
            'Warning: this class does not delete CKAN records. Use --force to run it.'
        )

    # Map each module name (AudTable value) to its Mongo collection -
    # one entry per MongoTask subclass.
    collections = {}
    for task_cls in MongoTask.__subclasses__():
        collections[task_cls.module] = task_cls(None).get_collection()

    ke_data = KEParser(self.input().open('r'),
                       file_path=self.input().path,
                       schema_file=self.keemu_schema_file)

    for record in self.iterate_data(ke_data):
        module = record.get('AudTable')
        irn = record.get('AudKey')
        if module not in collections:
            # No collection registered for this module - skip the record
            log.debug('Skipping eaudit record for %s' % module)
            continue
        log.info('Deleting record %s(%s)' % (module, irn))
        self.delete(collections[module], irn)

    self.mark_complete()
def process_record(self, data):
    """Validate and normalise a single KE EMu catalogue record.

    Rejects records that should not be imported, casts a handful of
    fields to strings, flags CITES species, computes the effective
    embargo date, then delegates to the parent class.

    Args:
        data: dict of KE EMu field values for one record.

    Returns:
        Whatever the parent class's process_record() returns.

    Raises:
        InvalidRecordException: if the record is an excluded type, has
            an invalid GUID, lacks a collection department, or has an
            invalid insertion date.
    """
    # Only import if it's one of the record types we want
    record_type = data.get('ColRecordType', 'Missing')
    if record_type in self.excluded_types:
        log.debug('Skipping record %s: Excluded type %s', data['irn'], record_type)
        raise InvalidRecordException

    # Make sure the GUID, when present, is a valid UUID hex string
    guid = data.get('AdmGUIDPreferredValue', None)
    if guid:
        try:
            UUID(guid, version=4)
        except ValueError:
            # Not a valid hex code for a UUID - reject the record.
            # (Was a bare Python 2 print statement; use the module
            # logger like the rest of this method.)
            log.error('Skipping record %s: invalid GUID %s', data['irn'], guid)
            raise InvalidRecordException

    # If we don't have collection department, skip it
    if not data.get('ColDepartment', None):
        raise InvalidRecordException

    date_inserted = data.get('AdmDateInserted', None)
    # Some records have an invalid AdmDateInserted=20-09-27
    # As we need this for the stats, we need to skip them - just checking
    # against date length as it's much quicker than parsing the date
    if not date_inserted or len(DATE_FORMAT) != len(date_inserted):
        log.error('Skipping record %s: invalid AdmDateInserted %s', data['irn'], date_inserted)
        raise InvalidRecordException

    # For now, the mongo aggregator cannot handle int / bool in $concat
    # So properties that are used in dynamicProperties need to be cast as strings
    for field in (
        'DnaTotalVolume',
        'FeaCultivated',
        'MinMetRecoveryWeight',
        'MinMetWeightAsRegistered'
    ):
        if field in data:
            data[field] = str(data[field])

    # If record is a CITES species, mark cites = True
    scientific_name = data.get('DarScientificName', None)
    if scientific_name and scientific_name in self.cites_species:
        data['cites'] = True

    # For the embargo date, we're going to use the latest of
    # NhmSecEmbargoDate and NhmSecEmbargoExtensionDate - convert each to
    # a timestamp (0 when unset) and take the maximum.
    embargo_list = []
    for f in ['NhmSecEmbargoDate', 'NhmSecEmbargoExtensionDate']:
        if data.get(f):
            ts = self.date_to_timestamp(data.get(f))
        else:
            ts = 0
        embargo_list.append(ts)

    # Set the Real Embargo date to the largest embargo or extension date
    data['RealEmbargoDate'] = max(embargo_list)

    return super(MongoCatalogueTask, self).process_record(data)