def process(self, data):
    """
    Create or update the database record for one parsed record.

    If a record with this IRN already exists it is rebuilt from ``data``;
    an existing stub record is first switched to the polymorphic type of
    ``self.model_class`` and reloaded. If no record exists a new
    ``self.model_class`` instance is created. The record is then merged
    into the session and committed.

    :param data: mapping of field values for the record; must contain 'irn'
    """
    log.debug('Processing %s record %s', self.model_class.__name__.lower(), data['irn'])

    try:
        # Do we already have a record for this?
        record = self.get_record(data.get('irn'))

        # Is this a stub record? If it is, we want to change the type and reload.
        # Seems a bit of a hack, but SQLAlchemy does not have a simple way of modifying the type
        # This only runs for catalogue records
        if isinstance(record, StubModel):
            polymorphic_type = self.model_class.__mapper_args__['polymorphic_identity']
            table_name = self.model_class.__mapper__.local_table.name

            # Manually set type
            self.session.execute(
                'UPDATE %s.catalogue SET type=:type WHERE irn=:irn' % self.keemu_schema,
                {'type': polymorphic_type, 'irn': data.get('irn')}
            )

            # If this has a child table, insert the IRN so updates will work
            if table_name != 'specimen':
                # And create empty row in the polymorphic table
                self.session.execute(
                    'INSERT INTO %s.%s (irn) VALUES (:irn)' % (self.keemu_schema, table_name),
                    {'irn': data.get('irn')}
                )

            # Commit & expunge so the item can be reloaded
            self.session.commit()
            self.session.expunge(record)
            record = self.get_record(data.get('irn'))

        # Process the relationships
        data = self._process_relationships(data, record)
        # Populate the data
        record.rebuild(**data)

    except NoResultFound:
        data = self._process_relationships(data)
        # Create a new record
        record = self.model_class(**data)

    try:
        self.session.merge(record)
        self.session.commit()
    except DataError as e:
        # Make sure the session is usable for the next record: a flush that
        # fails outside commit() (e.g. an autoflush during merge) leaves the
        # transaction inactive until rollback() is called.
        self.session.rollback()
        # Save this error to the log - will need to follow up on these.
        # The dict is passed as the logging args (presumably read by a log
        # handler - confirm before changing this call).
        log.critical('DB DataError: record %s not created.' % data['irn'], {'data': data}, exc_info=e)
def get_model_class(self, data):
    """
    Retrieve the model class for a specimen record, using candidate classes based on name, dept etc.,

    Candidate class names are built from the record type, most specific
    first (ColKind + type, ColDepartment + type, then type alone), and the
    first one defined in this module's globals is returned.

    :param data: mapping of field values for the record
    :return: the matching model class, or None if the record has no record
             type or no candidate class is defined
    """
    # If it doesn't even have a record type, what's the point of keeping it?
    if 'ColRecordType' not in data:
        log.debug('Skipping record %s: No record type', data['irn'])
        return None

    collection = data.get('ColKind')
    collection_department = data.get('ColDepartment')

    # KE EMU has case insensitive record types (2820735: specimen)
    # So make sure the first letter is capitalised.
    # Slicing (rather than indexing [0]) keeps an empty record type from raising IndexError.
    raw_record_type = data['ColRecordType']
    record_type = raw_record_type[:1].capitalize() + raw_record_type[1:]

    # Build an array of potential candidate classes
    candidate_classes = []
    matches = self.re_model.match(record_type)

    if matches:
        cls = matches.group(0).replace(' ', '')

        if collection:
            # Add candidate class based on ColKind (used for mineralogy) MeteoritesSpecimenModel
            candidate_classes.append('{0}{1}Model'.format(collection, cls))

        if collection_department:
            # Add candidate class BotanySpecimenModel
            candidate_classes.append('{0}{1}Model'.format(collection_department, cls))

        # Add candidate class SpecimenModel, ArtefactModel
        candidate_classes.append('{0}Model'.format(cls))

    module_globals = globals()
    for candidate_class in candidate_classes:
        # Do we have a model class for this candidate
        if candidate_class in module_globals:
            return module_globals[candidate_class]

    return None
def run(self):
    """
    Delete the records listed in the audit export from the database.

    For every parsed row, the model for the row's ``AudTable`` module is
    looked up and the record whose IRN equals ``AudKey`` is loaded and
    deleted through the session. Rows for unknown modules are skipped.
    Records that cannot be found are logged: at debug level when the
    deletion happened within 7 days of the insert, otherwise as an error.
    All deletions are committed once at the end and the output is touched.
    """
    # Need to load an SQLA model
    # So build a dict of all models keyed by KE EMu module
    models = {}
    for cls in KEDataTask.__subclasses__():
        models[cls.module] = cls.model_class if cls.model_class else CatalogueModel

    ke_data = KEParser(self.input().open('r'), schema_file=self.keemu_schema_file, input_file_path=self.input().path)

    for data in ke_data:
        module = data.get('AudTable')
        irn = data.get('AudKey')

        try:
            model = models[module]
        except KeyError:
            log.debug('Skipping eaudit record for %s' % module)
        else:
            try:
                log.debug('Deleting record %s(%s)' % (model, irn))
                # Load the object and then delete so we use the SQLA inheritance.
                # Query the model for this audit row and match on its IRN.
                obj = self.session.query(model).filter(model.irn == irn).one()
                self.session.delete(obj)
            except NoResultFound:
                # We cannot delete this record as it doesn't exist
                # There are a lot of records being inserted and then deleted again
                # So will never appear on the insert exports
                try:
                    date_inserted = datetime.strptime(data.get('AdmDateInserted'), "%Y-%m-%d")
                    date_deleted = datetime.strptime(data.get('AudDate'), "%Y-%m-%d")
                except (TypeError, ValueError):
                    # A date is missing or malformed, so the threshold cannot
                    # be applied; report it as a normal "not found" error
                    # instead of aborting the whole run.
                    within_threshold = False
                else:
                    within_threshold = date_deleted - timedelta(days=7) < date_inserted

                # If date deleted is within 7 days of the insert date, do not flag an error
                if within_threshold:
                    log.debug('Record %s(%s) not found for deletion, but within date threshold (inserted: %s deleted: %s)' % (model.__name__, irn, date_inserted, date_deleted))
                else:
                    log.error('Record %s(%s) not found for deletion' % (model, irn))

    self.session.commit()
    self.output().touch()
def process(self, data):
    """
    Filter and pre-process a catalogue record, then pass it to the parent task.

    Sets ``self.model_class`` from the record. Records are skipped (nothing
    is passed on) when no model class is found, when ColDepartment or
    AdmDateInserted is missing, when the record status or herbarium
    organisation marks it for exclusion, or when it is an Artefact without
    kind or name. Otherwise determinations, host/parasite associations and
    some special field mappings are added to ``data`` before calling the
    parent ``process``.

    :param data: mapping of field values for the record; modified in place
    """
    # Try and get the model class
    self.model_class = self.get_model_class(data)

    # Without a model class there is nothing to do - move on to the next record
    if not self.model_class:
        record_type = data.get('ColRecordType', 'Missing')
        if record_type not in self.excluded_types:
            # Critical error - log to DB
            log.critical('Unknown model class %s for %s. Investigate and then add to [excluded_types] if not required.', record_type, data['irn'])
        else:
            # Record type is one we've knowingly excluded
            log.debug('Skipping record %s: No model class for %s', data['irn'], record_type)
        return

    # Filter out some of the records
    if 'ColDepartment' not in data:
        log.debug('Skipping record %s: No collection department', data['irn'])
        return None

    if 'AdmDateInserted' not in data:
        log.debug('Skipping record %s: No AdmDateInserted', data['irn'])
        return None

    # Skip records if SecRecordStatus is one of 'DELETE', 'Reserved', 'Stub', 'Stub Record', 'DELETE-MERGED'
    skipped_statuses = ('DELETE', 'Reserved', 'Stub', 'Stub Record', 'DELETE-MERGED')
    if 'SecRecordStatus' in data and data['SecRecordStatus'] in skipped_statuses:
        log.debug('Skipping record %s: Incorrect record status', data['irn'])
        return None

    # Botany records include ones from Linnean Society. Should be excluded.
    if 'RegHerbariumCurrentOrgAcroLocal' in data and data['RegHerbariumCurrentOrgAcroLocal'] == 'LINN':
        log.debug('Skipping record %s: Non-BM botany record', data['irn'])
        return None

    # 4257 Artefacts have no kind or name. Skip them
    is_artefact = data['ColRecordType'] == 'Artefact'
    if is_artefact and not ('ArtKind' in data or 'ArtName' in data):
        return None

    # Process determinations
    determinations = data.get('EntIdeTaxonRef', None) or data.get('EntIndIndexLotTaxonNameLocalRef', None)
    if determinations:
        data['specimen_taxonomy'] = []
        determination_irns = self.ensure_list(determinations)

        # Load the taxonomy records for these determinations
        taxonomy_query = self.session.query(TaxonomyModel).filter(TaxonomyModel.irn.in_(determination_irns))
        filed_as_irn = data.get('EntIdeFiledAsTaxonRef', None)

        # Only taxa that were actually retrieved get a determination
        # This will act as a filter, removing all duplicates / missing taxa
        for taxon in taxonomy_query.all():
            data['specimen_taxonomy'].append(
                Determination(
                    taxonomy_irn=taxon.irn,
                    specimen_irn=data['irn'],
                    filed_as=(taxon.irn == filed_as_irn)
                )
            )

    # Parasite card host / parasites
    host_parasites = {
        'host': data.get('CardHostRef', []),
        'parasite': data.get('CardParasiteRef', []),
    }
    stages = self.ensure_list(data.get('CardParasiteStage', []))

    for association_type, refs in host_parasites.items():
        for position, ref in enumerate(self.ensure_list(refs)):
            # Stage is matched to the ref by position; refs beyond the end get no stage
            try:
                stage = stages[position]
            except IndexError:
                stage = None

            association = HostParasiteAssociation(
                taxonomy_irn=ref,
                parasite_card_irn=data['irn'],
                parasite_host=association_type,
                stage=stage
            )

            try:
                data['host_parasite_taxonomy'].append(association)
            except KeyError:
                data['host_parasite_taxonomy'] = [association]

    # Some special field mappings

    # Try to use PalDetDate is if DarYearIdentified is missing
    if 'DarYearIdentified' not in data:
        try:
            date_matches = self.re_date.search(data['PalDetDate'])
        except (KeyError, TypeError):
            # If PalDetDate doesn't exists or isn't a string (can also be a list if there's multiple
            # determination dates - which we ignore)
            date_matches = None

        if date_matches:
            identified_fields = ('DarYearIdentified', 'DarMonthIdentified', 'DarDayIdentified')
            for group_number, field in enumerate(identified_fields, 1):
                data[field] = date_matches.group(group_number)

    # EntCatCatalogueNumber requires EntCatPrefix if it's used in catalogue_number
    if 'EntCatPrefix' in data and 'EntCatCatalogueNumber' in data:
        data['EntCatCatalogueNumber'] = '{0}{1}'.format(data['EntCatPrefix'], data['EntCatCatalogueNumber'])

    # Set egg part type if not already set
    if 'PrtType' not in data and self.model_class is EggModel:
        data['PrtType'] = 'egg'

    super(CatalogueTask, self).process(data)