def run(
    # Do we need other default functions here?
    #extract_api_persons=extract_api_persons,
    db_name=db_name,
    transaction_record_limit=transaction_record_limit,
    experts_etl_logger=None
):
    # This connection API in general needs work. Including this here for the sake of
    # consistency with other ETL modules' run() functions.
    sqlapi.setengine(db.engine(db_name))

    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: oit -> edw', extra={'pure_sync_job': 'person'})

    engine = db.engine(db_name)
    prepare_target_scratch_tables(engine)

    with db.session(db_name) as session:
        prepare_source_tables(engine, session)
        load_count = 0
        for demog in session.query(PureEligibleDemogChngHst.emplid).distinct().all():
            # Stop-gap prevention of unique key constraint violation:
            if demog.emplid == '8004768':
                continue
            person_dict = transform(session, extract(session, demog.emplid))
            if len(person_dict['jobs']) == 0 and len(person_dict['programs']) == 0:
                record_person_no_job_data_error(
                    session=session,
                    emplid=demog.emplid,
                    internet_id=person_dict['internet_id'],
                )
                continue
            load_into_scratch(session, person_dict)
            load_count += 1
            if load_count >= transaction_record_limit:
                session.commit()
                load_count = 0
        update_targets_from_scratch()
        session.commit()

    experts_etl_logger.info('ending: oit -> edw', extra={'pure_sync_job': 'person'})
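# A minimal sketch (not the actual implementation) of the scratch-table staging
# pattern implied by prepare_target_scratch_tables(), load_into_scratch(), and
# update_targets_from_scratch() above. The table names here are hypothetical:
from sqlalchemy import text

def _scratch_swap_sketch(engine):
    with engine.begin() as conn:
        # Start from an empty scratch copy of the (hypothetical) target table:
        conn.execute(text('DELETE FROM person_scratch'))
        # ...bulk inserts into person_scratch would happen here...
        # Then replace the target contents from scratch in one transaction:
        conn.execute(text('DELETE FROM person'))
        conn.execute(text('INSERT INTO person SELECT * FROM person_scratch'))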
def run(
    db_name=db_name,
    template=template,
    output_filename=output_filename,
    experts_etl_logger=None
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: edw -> pure', extra={'pure_sync_job': 'person'})

    with open(output_filename, 'w') as output_file:
        output_file.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
        output_file.write('<persons xmlns="v1.unified-person-sync.pure.atira.dk" xmlns:v3="v3.commons.pure.atira.dk">\n')
        with db.session(db_name) as session:
            for person in session.query(PureSyncPersonData).all():
                person_dict = {c.name: getattr(person, c.name) for c in person.__table__.columns}

                person_dict['jobs'] = []
                for job in session.query(PureSyncStaffOrgAssociation).filter(
                    PureSyncStaffOrgAssociation.person_id == person.person_id
                ).all():
                    job_dict = {c.name: getattr(job, c.name) for c in job.__table__.columns}
                    person_dict['jobs'].append(job_dict)

                person_dict['programs'] = []
                for program in session.query(PureSyncStudentOrgAssociation).filter(
                    PureSyncStudentOrgAssociation.person_id == person.person_id
                ).all():
                    program_dict = {c.name: getattr(program, c.name) for c in program.__table__.columns}
                    person_dict['programs'].append(program_dict)

                output_file.write(template.render(person_dict))
        output_file.write('</persons>')

    experts_etl_logger.info('ending: edw -> pure', extra={'pure_sync_job': 'person'})
def run(
    db_name=db_name,
    template=template,
    output_filename=output_filename,
    experts_etl_logger=None
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: edw -> pure', extra={'pure_sync_job': 'user'})

    with open(output_filename, 'w') as output_file:
        output_file.write('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
        output_file.write('<users xmlns="v1.user-sync.pure.atira.dk" xmlns:v3="v3.commons.pure.atira.dk">\n')
        with db.session(db_name) as session:
            for user in session.query(PureSyncUserData).all():
                user_dict = {c.name: getattr(user, c.name) for c in user.__table__.columns}
                output_file.write(template.render(user_dict))
        output_file.write('</users>')

    experts_etl_logger.info('ending: edw -> pure', extra={'pure_sync_job': 'user'})
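# The module-level `template` default used by the two XML-writing run()
# functions above is defined elsewhere. A plausible construction with Jinja2
# (the loader path and template file name are assumptions), where each
# render() call receives one record dict and returns an XML fragment:
from jinja2 import Environment, FileSystemLoader

def _template_sketch():
    env = Environment(loader=FileSystemLoader('templates'), autoescape=True)
    return env.get_template('person.xml.j2')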
def run(
    startdate=None,
    startdate_str=None,  # yyyy-MM-dd or yyyy-MM-dd_HH-mm-ss format
    db_name=db_name,
    transaction_record_limit=transaction_record_limit,
    experts_etl_logger=None,
    pure_api_config=None
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info('starting: extracting/loading', extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    with db.session(db_name) as session:
        record_count = 0
        if startdate_str is None:
            startdate_str = ensure_valid_startdate(session, startdate).isoformat()
        for api_change in client.get_all_changes_transformed(startdate_str, config=pure_api_config):
            if not required_fields_exist(api_change):
                continue
            if api_change.familySystemName not in family_system_name_db_class_map:
                continue
            if matching_db_record_exists(session, api_change) \
                    and previously_processed_same_or_newer_change(session, api_change):
                continue
            if already_loaded_same_or_newer_change(session, api_change):
                continue
            load_api_change(session, api_change)
            record_count += 1
            if (record_count % transaction_record_limit) == 0:
                session.commit()

    experts_etl_logger.info('ending: extracting/loading', extra={'pure_api_record_type': pure_api_record_type})
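# Hypothetical invocations of the change-feed run() above; per the comment on
# startdate_str, either documented format should be accepted:
if __name__ == '__main__':
    run(startdate_str='2020-01-01')              # date only
    # run(startdate_str='2020-01-01_00-00-00')   # date and time
    # run()  # derive the start date via ensure_valid_startdate()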
def session():
    with db.session('hotel') as session:
        yield session
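# The generator above reads like a pytest fixture (the @pytest.fixture
# decorator is presumably applied where it is defined). A hypothetical test
# consuming it, using a trivial round-trip to prove the session is usable:
from sqlalchemy import text

def test_session_connects(session):
    assert session.execute(text('SELECT 1')).scalar() == 1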
def run(
    # Do we need other default functions here?
    #extract_api_changes=extract_api_changes,
    db_name=db_name,
    transaction_record_limit=transaction_record_limit,
    experts_etl_logger=None,
    pure_api_config=None
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: extracting/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    latest_change = None
    try:
        with db.session(db_name) as session:
            processed_changes = []
            for changes in changes_for_family_ordered_by_uuid_version(session, 'ExternalPerson'):
                latest_change = changes[0]
                db_person = get_db_person(session, latest_change.uuid)

                # We delete here and continue, because there will be no record
                # to download from the Pure API when it has been deleted.
                if latest_change.change_type == 'DELETE':
                    if db_person:
                        delete_db_person(session, db_person)
                    processed_changes.extend(changes)
                    if len(processed_changes) >= transaction_record_limit:
                        record_changes_as_processed(session, processed_changes)
                        processed_changes = []
                        session.commit()
                    continue

                r = None
                try:
                    r = client.get(pure_api_record_type + '/' + latest_change.uuid, config=pure_api_config)
                except PureAPIHTTPError as e:
                    if e.response.status_code == 404:
                        if db_person:
                            # This record has been deleted from Pure but still exists in our local db:
                            delete_db_person(session, db_person)
                            processed_changes.extend(changes)
                            if len(processed_changes) >= transaction_record_limit:
                                record_changes_as_processed(session, processed_changes)
                                processed_changes = []
                                session.commit()
                    else:
                        experts_etl_logger.error(
                            f'HTTP error {e.response.status_code} returned during record extraction',
                            extra={'pure_uuid': latest_change.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue
                except PureAPIRequestException as e:
                    formatted_exception = loggers.format_exception(e)
                    experts_etl_logger.error(
                        f'mysterious client request exception encountered during record extraction: {formatted_exception}',
                        extra={'pure_uuid': latest_change.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue
                except Exception:
                    raise

                api_external_person = response.transform(
                    pure_api_record_type, r.json(), version=pure_api_config.version)

                delete_merged_records(session, api_external_person)

                if db_person and db_person_same_or_newer_than_api_external_person(
                        session, db_person, api_external_person):
                    continue
                if already_loaded_same_api_external_person(session, api_external_person):
                    continue

                load_api_external_person(session, api_external_person, r.text)

                processed_changes.extend(changes)
                if len(processed_changes) >= transaction_record_limit:
                    record_changes_as_processed(session, processed_changes)
                    processed_changes = []
                    session.commit()

            record_changes_as_processed(session, processed_changes)
            session.commit()
    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record extraction: {formatted_exception}',
            extra={'pure_uuid': latest_change.uuid, 'pure_api_record_type': pure_api_record_type})

    experts_etl_logger.info(
        'ending: extracting/loading',
        extra={'pure_api_record_type': pure_api_record_type})
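# The 404 handling above encodes the convention that a Pure API GET for a
# record deleted upstream returns HTTP 404, in which case the local copy is
# deleted too. A standalone sketch of that check using requests directly
# (the base URL and path are hypothetical):
import requests

def _deleted_in_pure_sketch(base_url, uuid):
    r = requests.get(f'{base_url}/external-persons/{uuid}')
    if r.status_code == 404:
        return True   # deleted upstream; caller should delete the local copy
    r.raise_for_status()  # surface any other HTTP error
    return False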
def run(
    # Do we need other default functions here?
    extract_api_persons=extract_api_persons,
    db_name=db_name,
    transaction_record_limit=transaction_record_limit,
    pure_api_record_logger=pure_api_record_logger,
    experts_etl_logger=None,
    pure_api_config=None
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    api_person = None
    try:
        with db.session(db_name) as session:
            processed_api_person_uuids = []
            for db_api_person in extract_api_persons(session):
                api_person = response.transform(
                    pure_api_record_type,
                    json.loads(db_api_person.json),
                    version=pure_api_config.version)

                db_person = get_db_person(session, db_api_person.uuid)
                db_person_previously_existed = False
                if db_person:
                    db_person_previously_existed = True
                    if db_person.pure_modified and db_person.pure_modified >= db_api_person.modified:
                        # Skip this record, since we already have a newer one:
                        processed_api_person_uuids.append(db_api_person.uuid)
                        continue
                else:
                    db_person = create_db_person(api_person)

                db_person.internet_id = None
                db_person.first_name = api_person.name.firstName
                db_person.last_name = api_person.name.lastName
                db_person.pure_modified = db_api_person.modified
                # Doubt that we will ever get these for external persons:
                db_person.orcid = None
                db_person.hindex = None

                # Check for orgs not in EDW yet:
                api_org_uuids = set()
                for org_assoc in api_person.externalOrganisations:
                    api_org_uuids.add(org_assoc.uuid)

                db_org_uuids = set()
                if db_person_previously_existed:
                    # Avoid trying to query a person that doesn't exist in the db yet:
                    db_org_uuids = {db_org.pure_uuid for db_org in db_person.pure_orgs}

                api_only_org_uuids = api_org_uuids - db_org_uuids
                db_only_org_uuids = db_org_uuids - api_org_uuids

                # For now, skip this person if there are any orgs referenced in the api record
                # that we don't have in EDW:
                if len(api_only_org_uuids) > 0:
                    api_only_orgs_in_db = session.query(PureOrg).filter(
                        PureOrg.pure_uuid.in_(api_only_org_uuids)).all()
                    if len(api_only_org_uuids) > len(api_only_orgs_in_db):
                        experts_etl_logger.info(
                            'skipping updates: some associated orgs do not exist in EDW.',
                            extra={'pure_uuid': api_person.uuid, 'pure_api_record_type': pure_api_record_type})
                        continue

                # Now we can add the person to the session, because there are no other
                # reasons for intentionally skipping it:
                session.add(db_person)

                ## person pure orgs

                for org_uuid in api_only_org_uuids:
                    person_pure_org = PersonPureOrg(
                        person_uuid=db_person.uuid,
                        pure_org_uuid=org_uuid,
                    )
                    session.add(person_pure_org)

                session.query(PersonPureOrg).filter(
                    PersonPureOrg.person_uuid == db_person.uuid,
                    PersonPureOrg.pure_org_uuid.in_(db_only_org_uuids)
                ).delete(synchronize_session=False)

                ## scopus ids

                db_scopus_ids = set()
                if db_person_previously_existed:
                    # Avoid trying to query a person that doesn't exist in the db yet:
                    db_scopus_ids = set(db_person.scopus_ids)

                person_ids = get_person_ids(api_person)
                api_only_scopus_ids = person_ids['scopus_ids'] - db_scopus_ids
                db_only_scopus_ids = db_scopus_ids - person_ids['scopus_ids']

                for scopus_id in api_only_scopus_ids:
                    person_scopus_id = PersonScopusId(
                        person_uuid=db_person.uuid,
                        scopus_id=scopus_id,
                    )
                    session.add(person_scopus_id)

                session.query(PersonScopusId).filter(
                    PersonScopusId.person_uuid == db_person.uuid,
                    PersonScopusId.scopus_id.in_(db_only_scopus_ids)
                ).delete(synchronize_session=False)

                processed_api_person_uuids.append(api_person.uuid)
                if len(processed_api_person_uuids) >= transaction_record_limit:
                    mark_api_persons_as_processed(session, pure_api_record_logger, processed_api_person_uuids)
                    processed_api_person_uuids = []
                    session.commit()

            mark_api_persons_as_processed(session, pure_api_record_logger, processed_api_person_uuids)
            session.commit()
    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record transformation: {formatted_exception}',
            extra={'pure_uuid': api_person.uuid, 'pure_api_record_type': pure_api_record_type})

    loggers.rollover(pure_api_record_logger)
    experts_etl_logger.info(
        'ending: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})
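# The org and scopus-id handling above (and in the internal-person transform
# below) both follow the same set-difference sync pattern; a generic sketch,
# with illustrative names:
def _diff_sketch(api_ids, db_ids):
    to_insert = api_ids - db_ids   # in the API record but not yet in EDW
    to_delete = db_ids - api_ids   # in EDW but no longer in the API record
    return to_insert, to_delete

# e.g. _diff_sketch({'a', 'b'}, {'b', 'c'}) == ({'a'}, {'c'})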
def run(
    # Do we need other default functions here?
    extract_api_pubs=extract_api_pubs,
    db_name=db_name,
    transaction_record_limit=transaction_record_limit,
    pure_api_record_logger=pure_api_record_logger,
    experts_etl_logger=None,
    pure_api_config=None,
    force_load=False
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    api_pub = None
    try:
        with db.session(db_name) as session:
            processed_api_pub_uuids = []
            for db_api_pub in extract_api_pubs(session):
                api_pub = response.transform(
                    pure_api_record_type,
                    json.loads(db_api_pub.json),
                    version=pure_api_config.version)

                db_pub = get_db_pub(session, db_api_pub.uuid)
                db_pub_previously_existed = False
                if db_pub:
                    db_pub_previously_existed = True
                    if db_pub.pure_modified and db_pub.pure_modified >= db_api_pub.modified and not force_load:
                        # Skip this record, since we already have a newer one:
                        processed_api_pub_uuids.append(db_api_pub.uuid)
                        continue
                else:
                    db_pub = create_db_pub(api_pub)

                pub_ids = get_pub_ids(api_pub)
                db_pub.scopus_id = pub_ids['scopus_id']
                db_pub.pmid = pub_ids['pmid']
                db_pub.doi = pub_ids['doi']

                # Commented out for now, because we will rely more on pure types and subtypes (below):
                #db_pub.type = 'article-journal'
                type_uri_parts = api_pub.type.uri.split('/')
                type_uri_parts.reverse()
                pure_subtype, pure_type, pure_parent_type = type_uri_parts[0:3]
                db_pub.pure_type = pure_type
                db_pub.pure_subtype = pure_subtype

                db_pub.title = api_pub.title.value
                db_pub.container_title = api_pub.journalAssociation.title.value
                db_pub.issn = api_pub.journalAssociation.issn.value if 'issn' in api_pub.journalAssociation else None

                nullify_pub_states(db_pub)
                for api_pub_state in api_pub.publicationStatuses:
                    update_pub_state(db_pub, api_pub_state)

                db_pub.volume = api_pub.volume
                db_pub.issue = api_pub.journalNumber
                db_pub.pages = api_pub.pages
                db_pub.citation_total = api_pub.totalScopusCitations
                db_pub.pure_modified = db_api_pub.modified

                if 'managingOrganisationalUnit' in api_pub:
                    owner_pure_org_uuid = api_pub.managingOrganisationalUnit.uuid
                    owner_pure_org = session.query(PureOrg).filter(
                        PureOrg.pure_uuid == owner_pure_org_uuid).one_or_none()
                    if owner_pure_org is None:
                        experts_etl_logger.info(
                            'skipping updates: owner pure org does not exist in EDW.',
                            extra={'pure_uuid': api_pub.uuid, 'pure_api_record_type': pure_api_record_type})
                        continue
                    db_pub.owner_pure_org_uuid = owner_pure_org_uuid
                else:
                    # TODO: We do this because currently owner_pure_org_uuid is not null. We may want to change that.
                    experts_etl_logger.info(
                        'skipping updates: no owner pure org.',
                        extra={'pure_uuid': api_pub.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue

                ## associations

                author_ordinal = 0
                missing_person = False
                missing_person_pure_uuid = False
                missing_org = False
                pub_author_collabs = []
                all_author_collab_uuids = set()
                all_person_uuids = set()
                pub_persons = []
                pub_person_pure_orgs = []

                # personAssociations can contain authorCollaboration's, which are not persons at all,
                # so we call this variable author_assoc, to be more accurate here:
                for author_assoc in api_pub.personAssociations:
                    author_ordinal += 1

                    if 'authorCollaboration' in author_assoc:
                        author_collab_assoc = author_assoc
                        author_collab_pure_uuid = author_collab_assoc.authorCollaboration.uuid

                        # Sometimes Pure records contain duplicate author collaborations. Grrr...
                        if author_collab_pure_uuid in all_author_collab_uuids:
                            continue
                        all_author_collab_uuids.add(author_collab_pure_uuid)

                        db_author_collab = session.query(AuthorCollaboration).filter(
                            AuthorCollaboration.pure_uuid == author_collab_pure_uuid).one_or_none()
                        if db_author_collab is None:
                            db_author_collab = AuthorCollaboration(
                                uuid=str(uuid.uuid4()),
                                pure_uuid=author_collab_pure_uuid,
                            )
                        # This is less than ideal, but for now we just update the author collaboration
                        # name with whatever value this record includes:
                        db_author_collab.name = next(
                            (author_collab_text.value
                             for author_collab_text in author_collab_assoc.authorCollaboration.name.text
                             if author_collab_text.locale == 'en_US'),
                            None)
                        session.add(db_author_collab)

                        pub_author_collab = PubAuthorCollaboration(
                            pub_uuid=db_pub.uuid,
                            author_collaboration_uuid=db_author_collab.uuid,
                            author_ordinal=author_ordinal,
                            # TODO: This needs work. We may have tried mapping these to CSL values at
                            # one point, but now we're just taking what Pure gives us.
                            author_role=next(
                                (author_role_text.value
                                 for author_role_text in author_collab_assoc.personRole.term.text
                                 if author_role_text.locale == 'en_US'),
                                None).lower(),
                        )
                        pub_author_collabs.append(pub_author_collab)
                        continue

                    person_assoc = author_assoc
                    person_pure_uuid = None
                    if 'person' in person_assoc:
                        person_pure_uuid = person_assoc.person.uuid
                        person_pure_internal = 'Y'
                    if 'externalPerson' in person_assoc:
                        person_pure_uuid = person_assoc.externalPerson.uuid
                        person_pure_internal = 'N'

                    if person_assoc is not None and person_pure_uuid is None:
                        missing_person_pure_uuid = True
                        break

                    db_person = session.query(Person).filter(
                        Person.pure_uuid == person_pure_uuid).one_or_none()
                    if db_person is None:
                        missing_person = True
                        break

                    if db_person.uuid not in all_person_uuids:
                        pub_person = PubPerson(
                            pub_uuid=db_pub.uuid,
                            person_uuid=db_person.uuid,
                            person_ordinal=author_ordinal,
                            # TODO: This needs work. We may have tried mapping these to CSL values at
                            # one point, but now we're just taking what Pure gives us.
                            person_role=next(
                                (person_role_text.value
                                 for person_role_text in person_assoc.personRole.term.text
                                 if person_role_text.locale == 'en_US'),
                                None).lower(),
                            person_pure_internal=person_pure_internal,
                            first_name=person_assoc.name.firstName if 'firstName' in person_assoc.name else None,
                            last_name=person_assoc.name.lastName if 'lastName' in person_assoc.name else None,
                            emplid=db_person.emplid,
                        )
                        pub_persons.append(pub_person)
                        all_person_uuids.add(db_person.uuid)
                    else:
                        continue

                    all_person_org_uuids = set()
                    for api_pure_org in itertools.chain(
                            person_assoc.organisationalUnits,
                            person_assoc.externalOrganisations):
                        db_pure_org = session.query(PureOrg).filter(
                            PureOrg.pure_uuid == api_pure_org.uuid).one_or_none()
                        if db_pure_org is None:
                            missing_org = True
                            break

                        person_org_uuids = frozenset([db_person.uuid, db_pure_org.pure_uuid])
                        if person_org_uuids not in all_person_org_uuids:
                            pub_person_pure_org = PubPersonPureOrg(
                                pub_uuid=db_pub.uuid,
                                person_uuid=db_person.uuid,
                                pure_org_uuid=db_pure_org.pure_uuid,
                            )
                            pub_person_pure_orgs.append(pub_person_pure_org)
                            all_person_org_uuids.add(person_org_uuids)
                        else:
                            continue
                    if missing_org:
                        break

                if missing_person:
                    experts_etl_logger.info(
                        'skipping updates: one or more associated persons do not exist in EDW.',
                        extra={'pure_uuid': api_pub.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue
                if missing_person_pure_uuid:
                    experts_etl_logger.info(
                        'skipping updates: one or more associated persons has no pure uuid.',
                        extra={'pure_uuid': api_pub.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue
                if missing_org:
                    experts_etl_logger.info(
                        'skipping updates: one or more associated orgs do not exist in EDW.',
                        extra={'pure_uuid': api_pub.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue

                # Now we can add the pub to the session, because there are no other
                # reasons for intentionally skipping it:
                session.add(db_pub)

                # Now we can also delete and re-create the associations for this research output:
                session.query(PubAuthorCollaboration).filter(
                    PubAuthorCollaboration.pub_uuid == db_pub.uuid).delete(synchronize_session=False)
                for pub_author_collab in pub_author_collabs:
                    session.add(pub_author_collab)

                session.query(PubPerson).filter(
                    PubPerson.pub_uuid == db_pub.uuid).delete(synchronize_session=False)
                for pub_person in pub_persons:
                    session.add(pub_person)

                session.query(PubPersonPureOrg).filter(
                    PubPersonPureOrg.pub_uuid == db_pub.uuid).delete(synchronize_session=False)
                for pub_person_pure_org in pub_person_pure_orgs:
                    session.add(pub_person_pure_org)

                processed_api_pub_uuids.append(api_pub.uuid)
                if len(processed_api_pub_uuids) >= transaction_record_limit:
                    mark_api_pubs_as_processed(session, pure_api_record_logger, processed_api_pub_uuids)
                    processed_api_pub_uuids = []
                    session.commit()

            mark_api_pubs_as_processed(session, pure_api_record_logger, processed_api_pub_uuids)
            session.commit()
    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record transformation: {formatted_exception}',
            extra={'pure_uuid': api_pub.uuid, 'pure_api_record_type': pure_api_record_type})

    loggers.rollover(pure_api_record_logger)
    experts_etl_logger.info(
        'ending: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})
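# The next(...) expressions above (and in the org and internal-person
# transforms below) repeat the same locale-selection idiom; a hypothetical
# helper that captures it:
def _locale_value_sketch(texts, locale='en_US'):
    # Return the first text value in the given Pure locale, or None if absent:
    return next((t.value for t in texts if t.locale == locale), None)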
def run(
    # Do we need other default functions here?
    extract_api_orgs=extract_api_orgs,
    db_name=db_name,
    transaction_record_limit=transaction_record_limit,
    pure_api_record_logger=pure_api_record_logger,
    experts_etl_logger=None,
    pure_api_config=None
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    api_org = None
    try:
        with db.session(db_name) as session:
            processed_api_org_uuids = []
            for db_api_org in extract_api_orgs(session):
                api_org = response.transform(
                    pure_api_record_type,
                    json.loads(db_api_org.json),
                    version=pure_api_config.version)

                db_org = get_db_org(session, db_api_org.uuid)
                if db_org:
                    if db_org.pure_modified and db_org.pure_modified >= db_api_org.modified:
                        # Skip this record, since we already have a newer one:
                        processed_api_org_uuids.append(db_api_org.uuid)
                        continue
                else:
                    db_org = create_db_org(api_org)

                db_org.name_en = next(
                    (name_text.value for name_text in api_org.name.text
                     if name_text.locale == 'en_US'),
                    None)
                db_org.type = next(
                    (type_text.value for type_text in api_org.type.term.text
                     if type_text.locale == 'en_US'),
                    None).lower()
                db_org.pure_modified = db_api_org.modified

                session.add(db_org)

                processed_api_org_uuids.append(api_org.uuid)
                if len(processed_api_org_uuids) >= transaction_record_limit:
                    mark_api_orgs_as_processed(session, pure_api_record_logger, processed_api_org_uuids)
                    processed_api_org_uuids = []
                    session.commit()

            mark_api_orgs_as_processed(session, pure_api_record_logger, processed_api_org_uuids)
            session.commit()
    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record transformation: {formatted_exception}',
            extra={'pure_uuid': api_org.uuid, 'pure_api_record_type': pure_api_record_type})

    loggers.rollover(pure_api_record_logger)
    experts_etl_logger.info(
        'ending: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})
def run(
    # Do we need other default functions here?
    extract_api_persons=extract_api_persons,
    db_name=db_name,
    transaction_record_limit=transaction_record_limit,
    pure_api_record_logger=pure_api_record_logger,
    experts_etl_logger=None,
    pure_api_config=None
):
    if experts_etl_logger is None:
        experts_etl_logger = loggers.experts_etl_logger()
    experts_etl_logger.info(
        'starting: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})

    if pure_api_config is None:
        pure_api_config = Config()

    # Capture the current record for each iteration, so we can log it in case of an exception:
    api_person = None
    try:
        with db.session(db_name) as session:
            processed_api_person_uuids = []
            for db_api_person in extract_api_persons(session):
                api_person = response.transform(
                    pure_api_record_type,
                    json.loads(db_api_person.json),
                    version=pure_api_config.version)

                person_ids = get_person_ids(api_person)
                # Not sure that we should be requiring either of these in EDW, but we do for now,
                # in umn_person_pure_org, at least:
                if person_ids['emplid'] is None:
                    experts_etl_logger.info(
                        'skipping updates: missing emplid.',
                        extra={'pure_uuid': api_person.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue
                if api_person.externalId is None:
                    experts_etl_logger.info(
                        'skipping updates: missing pure id.',
                        extra={'pure_uuid': api_person.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue

                db_person = get_db_person(session, person_ids['emplid'])
                db_person_previously_existed = False
                if db_person:
                    db_person_previously_existed = True
                    if db_person.pure_modified and db_person.pure_modified >= db_api_person.modified:
                        # Skip this record, since we already have a newer one:
                        processed_api_person_uuids.append(db_api_person.uuid)
                        continue
                else:
                    db_person = create_db_person(person_ids['emplid'])

                # All internal persons should have this. Usually it will be the same as the emplid,
                # but sometimes the old SciVal identifier.
                db_person.pure_id = api_person.externalId
                db_person.pure_uuid = api_person.uuid
                db_person.internet_id = person_ids['internet_id']
                db_person.first_name = api_person.name.firstName
                db_person.last_name = api_person.name.lastName
                db_person.orcid = api_person.orcid
                db_person.hindex = api_person.scopusHIndex
                db_person.pure_modified = db_api_person.modified

                # TODO:
                # 2. Can internal persons have externalOrganisationAssociations (sp?)?
                #    Not sure yet, but they can have honoraryStaffOrganisationAssociations!
                # 3. If a person has an association with an org not in EDW yet, skip that person...
                #    or should we just immediately attempt to fetch the org and update EDW?
                #    For now, we'll skip that person.

                # Check for orgs not in EDW yet:
                api_org_uuids = set()
                for org_assoc in api_person.staffOrganisationAssociations:
                    api_org_uuids.add(org_assoc.organisationalUnit.uuid)

                db_org_uuids = set()
                if db_person_previously_existed:
                    # Avoid trying to query a person that doesn't exist in the db yet:
                    db_org_uuids = {db_org.pure_uuid for db_org in db_person.pure_orgs}

                api_only_org_uuids = api_org_uuids - db_org_uuids
                db_only_org_uuids = db_org_uuids - api_org_uuids

                # For now, skip this person if there are any orgs referenced in the api record
                # that we don't have in EDW:
                if len(api_only_org_uuids) > 0:
                    api_only_orgs_in_db = session.query(PureOrg).filter(
                        PureOrg.pure_uuid.in_(api_only_org_uuids)).all()
                    if len(api_only_org_uuids) > len(api_only_orgs_in_db):
                        experts_etl_logger.info(
                            'skipping updates: some associated orgs do not exist in EDW.',
                            extra={'pure_uuid': api_person.uuid, 'pure_api_record_type': pure_api_record_type})
                        continue

                ## umn person pure orgs aka staff organisation associations aka jobs

                # TODO: We may encounter duplicate jobs that break our uniqueness constraints.
                found_missing_job_description = False
                all_umn_person_pure_org_primary_keys = set()
                umn_person_pure_orgs = []
                for org_assoc in api_person.staffOrganisationAssociations:
                    job_description = next(
                        (job_description_text.value
                         for job_description_text in org_assoc.jobDescription.text
                         if job_description_text.locale == 'en_US'),
                        None)
                    if job_description is None:
                        found_missing_job_description = True
                        break

                    # Due to transitioning from master list to xml syncs of jobs, we may encounter duplicates.
                    # This may also happen due to manual entering of jobs in the Pure UI.
                    umn_person_pure_org_primary_keys = frozenset([
                        'person_uuid:' + db_person.uuid,
                        'pure_org_uuid:' + org_assoc.organisationalUnit.uuid,
                        'job_description:' + job_description,
                        'start_date:' + org_assoc.period.startDate,
                    ])
                    if umn_person_pure_org_primary_keys in all_umn_person_pure_org_primary_keys:
                        experts_etl_logger.info(
                            'duplicate job found',
                            extra={
                                'umn_person_pure_org_primary_keys': umn_person_pure_org_primary_keys,
                                'pure_uuid': api_person.uuid,
                                'pure_api_record_type': pure_api_record_type,
                            })
                        continue
                    all_umn_person_pure_org_primary_keys.add(umn_person_pure_org_primary_keys)

                    umn_person_pure_org = UmnPersonPureOrg(
                        pure_org_uuid=org_assoc.organisationalUnit.uuid,
                        person_uuid=db_person.uuid,
                        pure_person_id=db_person.pure_id,
                        emplid=db_person.emplid,
                        # Skipping this for now:
                        pure_org_id=None,
                        job_description=job_description,
                        # Note: Both employmentType and staffType may be missing because they are not required
                        # fields in the Pure UI, which UMN staff sometimes use to enter jobs not in PeopleSoft.

                        # Called employed_as in EDW, which was all 'Academic' as of 2018-06-05.
                        # Probably due to an artifact of the old master list upload process, or a
                        # misunderstanding of it. The newer EDW tables for upload (sync) to Pure
                        # have similar values in default_employed_as, but they're the last segment of the
                        # employmentType uri.
                        employed_as=next(
                            (employment_type_text.value
                             for employment_type_text in org_assoc.employmentType.term.text
                             if employment_type_text.locale == 'en_US'),
                            None),
                        # Sometimes staffType will be 'non-academic', but we allowed space in EDW
                        # only for 'nonacademic' (without a hyphen):
                        staff_type=re.sub(
                            '[^a-zA-Z]+',
                            '',
                            next((staff_type_text.value
                                  for staff_type_text in org_assoc.staffType.term.text
                                  if staff_type_text.locale == 'en_US'),
                                 None).lower()),
                        start_date=isoparse(org_assoc.period.startDate),
                        end_date=isoparse(org_assoc.period.endDate) if org_assoc.period.endDate else None,
                        primary='Y' if org_assoc.isPrimaryAssociation == True else 'N',
                    )
                    umn_person_pure_orgs.append(umn_person_pure_org)

                if found_missing_job_description:
                    experts_etl_logger.info(
                        'skipping updates: one or more org associations are missing job descriptions',
                        extra={'pure_uuid': api_person.uuid, 'pure_api_record_type': pure_api_record_type})
                    continue

                # Now we can add the person to the session, because there are no other
                # reasons for intentionally skipping it:
                session.add(db_person)

                # Now we can add the umn person pure orgs (i.e., jobs) to the session, too.
                # These are so complex, it's easiest to just delete and re-create them:
                session.query(UmnPersonPureOrg).filter(
                    UmnPersonPureOrg.person_uuid == db_person.uuid).delete()
                for umn_person_pure_org in umn_person_pure_orgs:
                    session.add(umn_person_pure_org)

                ## person pure orgs

                for org_uuid in api_only_org_uuids:
                    person_pure_org = PersonPureOrg(
                        person_uuid=db_person.uuid,
                        pure_org_uuid=org_uuid,
                    )
                    session.add(person_pure_org)

                session.query(PersonPureOrg).filter(
                    PersonPureOrg.person_uuid == db_person.uuid,
                    PersonPureOrg.pure_org_uuid.in_(db_only_org_uuids)
                ).delete(synchronize_session=False)

                ## scopus ids

                db_scopus_ids = set()
                if db_person_previously_existed:
                    # Avoid trying to query a person that doesn't exist in the db yet:
                    db_scopus_ids = set(db_person.scopus_ids)

                api_only_scopus_ids = person_ids['scopus_ids'] - db_scopus_ids
                db_only_scopus_ids = db_scopus_ids - person_ids['scopus_ids']

                for scopus_id in api_only_scopus_ids:
                    person_scopus_id = PersonScopusId(
                        person_uuid=db_person.uuid,
                        scopus_id=scopus_id,
                    )
                    session.add(person_scopus_id)

                session.query(PersonScopusId).filter(
                    PersonScopusId.person_uuid == db_person.uuid,
                    PersonScopusId.scopus_id.in_(db_only_scopus_ids)
                ).delete(synchronize_session=False)

                processed_api_person_uuids.append(api_person.uuid)
                if len(processed_api_person_uuids) >= transaction_record_limit:
                    mark_api_persons_as_processed(session, pure_api_record_logger, processed_api_person_uuids)
                    processed_api_person_uuids = []
                    session.commit()

            mark_api_persons_as_processed(session, pure_api_record_logger, processed_api_person_uuids)
            session.commit()
    except Exception as e:
        formatted_exception = loggers.format_exception(e)
        experts_etl_logger.error(
            f'exception encountered during record transformation: {formatted_exception}',
            extra={'pure_uuid': api_person.uuid, 'pure_api_record_type': pure_api_record_type})

    loggers.rollover(pure_api_record_logger)
    experts_etl_logger.info(
        'ending: transforming/loading',
        extra={'pure_api_record_type': pure_api_record_type})