def get_pathways(site, pathway_id=None):
    """
    Read pathways from the cache.

    The cache is populated by a management command, cache_programs.

    Arguments:
        site (Site): django.contrib.sites.models object

    Keyword Arguments:
        pathway_id (string): id identifying a specific pathway to read
            from the cache.

    Returns:
        list of dict, representing pathways.
        dict, if a specific pathway is requested.
    """
    missing_details_msg_tpl = 'Failed to get details for credit pathway {id} from the cache.'

    if pathway_id:
        # Single-pathway lookup: return whatever is cached (possibly None).
        pathway = cache.get(PATHWAY_CACHE_KEY_TPL.format(id=pathway_id))
        if not pathway:
            logger.warning(missing_details_msg_tpl.format(id=pathway_id))

        return pathway

    pathway_ids = cache.get(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=site.domain), [])
    if not pathway_ids:
        logger.warning('Failed to get credit pathway ids from the cache.')

    pathways = cache.get_many([
        PATHWAY_CACHE_KEY_TPL.format(id=pathway_id) for pathway_id in pathway_ids
    ])
    # Materialize into a list: dict_values views do not support `+=`, which
    # the retry below relies on (TypeError on Python 3 otherwise).
    pathways = list(pathways.values())

    # The get_many above sometimes fails to bring back details cached on one or
    # more Memcached nodes. It doesn't look like these keys are being evicted.
    # 99% of the time all keys come back, but 1% of the time all the keys stored
    # on one or more nodes are missing from the result of the get_many. One
    # get_many may fail to bring these keys back, but a get_many occurring
    # immediately afterwards will succeed in bringing back all the keys. This
    # behavior can be mitigated by trying again for the missing keys, which is
    # what we do here. Splitting the get_many into smaller chunks may also help.
    missing_ids = set(pathway_ids) - set(pathway['id'] for pathway in pathways)
    if missing_ids:
        logger.info(
            'Failed to get details for {count} pathways. Retrying.'.format(count=len(missing_ids))
        )

        retried_pathways = cache.get_many([
            PATHWAY_CACHE_KEY_TPL.format(id=pathway_id) for pathway_id in missing_ids
        ])
        pathways += list(retried_pathways.values())

        still_missing_ids = set(pathway_ids) - set(pathway['id'] for pathway in pathways)
        for missing_id in still_missing_ids:
            logger.warning(missing_details_msg_tpl.format(id=missing_id))

    return pathways
def test_get_many(self, mock_warning, mock_info):
    """
    Verify get_pathways() behavior when the cache holds only some of the
    pathway details: cached entries are returned, missing ones are retried
    and then logged.
    """
    pathways = PathwayFactory.create_batch(3)

    # Cache details for 2 of 3 programs.
    partial_pathways = {
        PATHWAY_CACHE_KEY_TPL.format(id=pathway['id']): pathway
        for pathway in pathways[:2]
    }
    cache.set_many(partial_pathways, None)

    # When called before pathways are cached, the function should return an
    # empty list and log a warning.
    assert get_pathways(self.site) == []
    mock_warning.assert_called_once_with(
        'Failed to get credit pathway ids from the cache.')
    mock_warning.reset_mock()

    # Cache all 3 pathways
    cache.set(
        SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=self.site.domain),
        [pathway['id'] for pathway in pathways],
        None)

    actual_pathways = get_pathways(self.site)

    # The 2 cached pathways should be returned while info and warning
    # messages should be logged for the missing one.
    assert {pathway['id'] for pathway in actual_pathways} ==\
        {pathway['id'] for pathway in partial_pathways.values()}
    mock_info.assert_called_with(
        'Failed to get details for 1 pathways. Retrying.')
    mock_warning.assert_called_with(
        'Failed to get details for credit pathway {id} from the cache.'.
        format(id=pathways[2]['id']))
    mock_warning.reset_mock()

    # We can't use a set comparison here because these values are dictionaries
    # and aren't hashable. We've already verified that all pathways came out
    # of the cache above, so all we need to do here is verify the accuracy of
    # the data itself.
    for pathway in actual_pathways:
        key = PATHWAY_CACHE_KEY_TPL.format(id=pathway['id'])
        assert pathway == partial_pathways[key]

    # Cache details for all 3 pathways.
    all_pathways = {
        PATHWAY_CACHE_KEY_TPL.format(id=pathway['id']): pathway
        for pathway in pathways
    }
    cache.set_many(all_pathways, None)

    actual_pathways = get_pathways(self.site)

    # All 3 pathways should be returned.
    assert {pathway['id'] for pathway in actual_pathways} ==\
        {pathway['id'] for pathway in all_pathways.values()}
    assert not mock_warning.called

    # Verify the cached data itself, entry by entry (dicts aren't hashable).
    for pathway in actual_pathways:
        key = PATHWAY_CACHE_KEY_TPL.format(id=pathway['id'])
        assert pathway == all_pathways[key]
def get_pathways(site, pathway_id=None):
    """
    Read pathways from the cache.

    The cache is populated by a management command, cache_programs.

    Arguments:
        site (Site): django.contrib.sites.models object

    Keyword Arguments:
        pathway_id (string): id identifying a specific pathway to read
            from the cache.

    Returns:
        list of dict, representing pathways.
        dict, if a specific pathway is requested.
    """
    missing_details_msg_tpl = 'Failed to get details for credit pathway {id} from the cache.'

    if pathway_id:
        # Single-pathway lookup; may return None if the entry is missing.
        pathway = cache.get(PATHWAY_CACHE_KEY_TPL.format(id=pathway_id))
        if not pathway:
            logger.warning(missing_details_msg_tpl.format(id=pathway_id))

        return pathway

    pathway_ids = cache.get(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=site.domain), [])
    if not pathway_ids:
        logger.warning('Failed to get credit pathway ids from the cache.')

    pathways = cache.get_many(
        [PATHWAY_CACHE_KEY_TPL.format(id=pathway_id) for pathway_id in pathway_ids]
    )
    # Convert the dict_values view to a list so the retry below can extend
    # it with `+=` (views don't implement in-place concatenation).
    pathways = list(pathways.values())

    # The get_many above sometimes fails to bring back details cached on one or
    # more Memcached nodes. It doesn't look like these keys are being evicted.
    # 99% of the time all keys come back, but 1% of the time all the keys stored
    # on one or more nodes are missing from the result of the get_many. One
    # get_many may fail to bring these keys back, but a get_many occurring
    # immediately afterwards will succeed in bringing back all the keys. This
    # behavior can be mitigated by trying again for the missing keys, which is
    # what we do here. Splitting the get_many into smaller chunks may also help.
    missing_ids = set(pathway_ids) - set(pathway['id'] for pathway in pathways)
    if missing_ids:
        logger.info(
            'Failed to get details for {count} pathways. Retrying.'.format(count=len(missing_ids))
        )

        retried_pathways = cache.get_many(
            [PATHWAY_CACHE_KEY_TPL.format(id=pathway_id) for pathway_id in missing_ids]
        )
        pathways += list(retried_pathways.values())

        still_missing_ids = set(pathway_ids) - set(pathway['id'] for pathway in pathways)
        for missing_id in still_missing_ids:
            logger.warning(missing_details_msg_tpl.format(id=missing_id))

    return pathways
def test_pathways_multiple_pages(self):
    """
    Verify that the command properly caches credit pathways when multiple pages are returned from its endpoint
    """
    UserFactory(username=self.catalog_integration.service_username)
    # 40 extra pathways (with no attached programs) on top of the fixtures,
    # enough to force pagination in the mocked endpoint.
    new_pathways = PathwayFactory.create_batch(40)
    for new_pathway in new_pathways:
        new_pathway['programs'] = []
    pathways = self.pathways + new_pathways

    programs = {
        PROGRAM_CACHE_KEY_TPL.format(uuid=program['uuid']): program
        for program in self.programs
    }
    self.mock_list()
    for uuid in self.uuids:
        program = programs[PROGRAM_CACHE_KEY_TPL.format(uuid=uuid)]
        self.mock_detail(uuid, program)

    # mock 3 pages of credit pathways, starting at the last
    self.mock_pathways(pathways[40:], page_number=3, final=True)
    self.mock_pathways(pathways[20:40], page_number=2, final=False)
    self.mock_pathways(pathways[:20], page_number=1, final=False)

    call_command('cache_programs')

    # Expected cache contents: one entry per pathway, keyed by id.
    pathways_dict = {
        PATHWAY_CACHE_KEY_TPL.format(id=pathway['id']): pathway
        for pathway in pathways
    }
    pathway_keys = list(pathways_dict.keys())

    cached_pathway_keys = cache.get(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=self.site_domain))
    self.assertEqual(
        set(cached_pathway_keys),
        set(pathway_keys)
    )

    cached_pathways = cache.get_many(pathway_keys)
    self.assertEqual(
        set(cached_pathways),
        set(pathways_dict)
    )

    # We can't use a set comparison here because these values are dictionaries
    # and aren't hashable. We've already verified that all pathways came out
    # of the cache above, so all we need to do here is verify the accuracy of
    # the data itself.
    for key, pathway in cached_pathways.items():
        # cached pathways store just program uuids instead of the full programs, transform before comparing
        pathways_dict[key]['program_uuids'] = [program['uuid'] for program in pathways_dict[key]['programs']]
        del pathways_dict[key]['programs']

        self.assertEqual(pathway, pathways_dict[key])
def test_pathways_multiple_pages(self):
    """
    Verify that the command properly caches credit pathways when multiple pages are returned from its endpoint
    """
    UserFactory(username=self.catalog_integration.service_username)

    # Forty extra pathways (no attached programs) force the mocked
    # endpoint to paginate.
    extra_pathways = PathwayFactory.create_batch(40)
    for extra in extra_pathways:
        extra['programs'] = []
    combined = self.pathways + extra_pathways

    keyed_programs = {}
    for program in self.programs:
        keyed_programs[PROGRAM_CACHE_KEY_TPL.format(uuid=program['uuid'])] = program

    self.mock_list()
    for uuid in self.uuids:
        self.mock_detail(uuid, keyed_programs[PROGRAM_CACHE_KEY_TPL.format(uuid=uuid)])

    # mock 3 pages of credit pathways, starting at the last
    self.mock_pathways(combined[40:], page_number=3, final=True)
    self.mock_pathways(combined[20:40], page_number=2, final=False)
    self.mock_pathways(combined[:20], page_number=1, final=False)

    call_command('cache_programs')

    # Expected cache contents, keyed the same way the command keys them.
    expected = {}
    for pathway in combined:
        expected[PATHWAY_CACHE_KEY_TPL.format(id=pathway['id'])] = pathway
    expected_keys = list(expected)

    cached_keys = cache.get(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=self.site_domain))
    self.assertEqual(set(cached_keys), set(expected_keys))

    cached_pathways = cache.get_many(expected_keys)
    self.assertEqual(set(cached_pathways), set(expected))

    # Dict values aren't hashable, so the detailed comparison happens
    # entry by entry rather than as one set comparison.
    for key, cached in cached_pathways.items():
        # cached pathways store just program uuids instead of the full programs, transform before comparing
        expected[key]['program_uuids'] = [program['uuid'] for program in expected[key]['programs']]
        del expected[key]['programs']

        self.assertEqual(cached, expected[key])
def test_handle_pathways(self):
    """
    Verify that the command requests and caches credit pathways
    """
    UserFactory(username=self.catalog_integration.service_username)

    programs = {
        PROGRAM_CACHE_KEY_TPL.format(uuid=program['uuid']): program
        for program in self.programs
    }

    # Expected pathway cache contents, keyed by pathway id.
    pathways = {
        PATHWAY_CACHE_KEY_TPL.format(id=pathway['id']): pathway
        for pathway in self.pathways
    }

    self.mock_list()
    self.mock_pathways(self.pathways)

    for uuid in self.uuids:
        program = programs[PROGRAM_CACHE_KEY_TPL.format(uuid=uuid)]
        self.mock_detail(uuid, program)

    call_command('cache_programs')

    cached_pathway_keys = cache.get(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=self.site_domain))
    pathway_keys = list(pathways.keys())
    self.assertEqual(
        set(cached_pathway_keys),
        set(pathway_keys)
    )

    cached_pathways = cache.get_many(pathway_keys)
    self.assertEqual(
        set(cached_pathways),
        set(pathways)
    )

    # We can't use a set comparison here because these values are dictionaries
    # and aren't hashable. We've already verified that all pathways came out
    # of the cache above, so all we need to do here is verify the accuracy of
    # the data itself.
    for key, pathway in cached_pathways.items():
        # cached pathways store just program uuids instead of the full programs, transform before comparing
        pathways[key]['program_uuids'] = [program['uuid'] for program in pathways[key]['programs']]
        del pathways[key]['programs']

        self.assertEqual(pathway, pathways[key])
def test_handle_pathways(self):
    """
    Verify that the command requests and caches credit pathways
    """
    UserFactory(username=self.catalog_integration.service_username)

    keyed_programs = {}
    for program in self.programs:
        keyed_programs[PROGRAM_CACHE_KEY_TPL.format(uuid=program['uuid'])] = program

    # What we expect the pathway cache to contain after the command runs.
    expected = {}
    for pathway in self.pathways:
        expected[PATHWAY_CACHE_KEY_TPL.format(id=pathway['id'])] = pathway

    self.mock_list()
    self.mock_pathways(self.pathways)

    for uuid in self.uuids:
        self.mock_detail(uuid, keyed_programs[PROGRAM_CACHE_KEY_TPL.format(uuid=uuid)])

    call_command('cache_programs')

    cached_keys = cache.get(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=self.site_domain))
    expected_keys = list(expected.keys())
    self.assertEqual(set(cached_keys), set(expected_keys))

    cached_pathways = cache.get_many(expected_keys)
    self.assertEqual(set(cached_pathways), set(expected))

    # Dict values aren't hashable, so compare each cached entry
    # individually instead of via a set.
    for key, cached in cached_pathways.items():
        # cached pathways store just program uuids instead of the full programs, transform before comparing
        expected[key]['program_uuids'] = [program['uuid'] for program in expected[key]['programs']]
        del expected[key]['programs']

        self.assertEqual(cached, expected[key])
def test_handle_missing_pathways(self):
    """
    Verify that the command raises an exception when it fails to retrieve pathways.
    """
    UserFactory(username=self.catalog_integration.service_username)

    keyed_programs = {
        PROGRAM_CACHE_KEY_TPL.format(uuid=program['uuid']): program
        for program in self.programs
    }

    self.mock_list()
    for uuid in self.uuids:
        self.mock_detail(uuid, keyed_programs[PROGRAM_CACHE_KEY_TPL.format(uuid=uuid)])

    # No pathway endpoint is mocked, so the command should exit non-zero.
    with self.assertRaises(SystemExit) as context:
        call_command('cache_programs')
    self.assertEqual(context.exception.code, 1)

    # The site's pathway-id list ends up cached as an empty list.
    cached_pathways = cache.get(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=self.site_domain))
    self.assertEqual(cached_pathways, [])
def test_handle_missing_pathways(self):
    """
    Verify that the command raises an exception when it fails to retrieve pathways.
    """
    UserFactory(username=self.catalog_integration.service_username)

    programs = {
        PROGRAM_CACHE_KEY_TPL.format(uuid=program['uuid']): program
        for program in self.programs
    }

    # Note: the pathway endpoint is deliberately NOT mocked, so pathway
    # retrieval fails and the command should exit with status 1.
    self.mock_list()
    for uuid in self.uuids:
        program = programs[PROGRAM_CACHE_KEY_TPL.format(uuid=uuid)]
        self.mock_detail(uuid, program)

    with self.assertRaises(SystemExit) as context:
        call_command('cache_programs')
    self.assertEqual(context.exception.code, 1)

    # The pathway-id list for the site should be cached as an empty list.
    cached_pathways = cache.get(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=self.site_domain))
    self.assertEqual(cached_pathways, [])
def handle(self, *args, **options):
    """
    Fetch program and credit-pathway data for every configured site and
    store it in the cache.

    Sites without a COURSE_CATALOG_API_URL configured are skipped (their
    cached id lists are reset to empty). If any site's data could not be
    fully retrieved, exits with status 1 so a CI job running this command
    fails visibly.
    """
    failure = False
    logger.info('populate-multitenant-programs switch is ON')

    catalog_integration = CatalogIntegration.current()
    username = catalog_integration.service_username

    try:
        user = User.objects.get(username=username)
    except User.DoesNotExist:
        logger.error(
            'Failed to create API client. Service user {username} does not exist.'.format(username=username)
        )
        raise

    programs = {}
    pathways = {}
    for site in Site.objects.all():
        site_config = getattr(site, 'configuration', None)
        if site_config is None or not site_config.get_value('COURSE_CATALOG_API_URL'):
            logger.info('Skipping site {domain}. No configuration.'.format(domain=site.domain))
            cache.set(SITE_PROGRAM_UUIDS_CACHE_KEY_TPL.format(domain=site.domain), [], None)
            cache.set(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=site.domain), [], None)
            continue

        client = create_catalog_api_client(user, site=site)

        uuids, program_uuids_failed = self.get_site_program_uuids(client, site)
        new_programs, program_details_failed = self.fetch_program_details(client, uuids)
        new_pathways, pathways_failed = self.get_pathways(client, site)
        new_pathways, new_programs, pathway_processing_failed = self.process_pathways(site, new_pathways, new_programs)

        # Accumulate failures across sites; never reset to False.
        if program_uuids_failed or program_details_failed or pathways_failed or pathway_processing_failed:
            failure = True

        programs.update(new_programs)
        pathways.update(new_pathways)

        logger.info('Caching UUIDs for {total} programs for site {site_name}.'.format(
            total=len(uuids),
            site_name=site.domain,
        ))
        cache.set(SITE_PROGRAM_UUIDS_CACHE_KEY_TPL.format(domain=site.domain), uuids, None)

        # Materialize the keys view before caching: a lazy dict_keys view
        # may not serialize cleanly in all cache backends.
        pathway_ids = list(new_pathways.keys())
        logger.info('Caching ids for {total} credit pathways for site {site_name}.'.format(
            total=len(pathway_ids),
            site_name=site.domain,
        ))
        cache.set(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=site.domain), pathway_ids, None)

    successful_programs = len(programs)
    logger.info('Caching details for {successful_programs} programs.'.format(
        successful_programs=successful_programs))
    cache.set_many(programs, None)

    successful_pathways = len(pathways)
    logger.info('Caching details for {successful_pathways} credit pathways.'.format(
        successful_pathways=successful_pathways))
    cache.set_many(pathways, None)

    if failure:
        # This will fail a Jenkins job running this command, letting site
        # operators know that there was a problem.
        sys.exit(1)
def handle(self, *args, **options):  # lint-amnesty, pylint: disable=too-many-statements
    """
    Fetch program, pathway, course, program-type, and organization data for
    every configured site and store it in the cache.

    Sites without a COURSE_CATALOG_API_URL configured are skipped (their
    cached id lists are reset to empty). If any site's data could not be
    fully retrieved, exits with status 1 so a CI job running this command
    fails visibly.
    """
    failure = False
    logger.info('populate-multitenant-programs switch is ON')

    catalog_integration = CatalogIntegration.current()
    username = catalog_integration.service_username

    try:
        user = User.objects.get(username=username)
    except User.DoesNotExist:
        logger.exception(
            f'Failed to create API client. Service user {username} does not exist.'
        )
        raise

    programs = {}
    pathways = {}
    courses = {}
    catalog_courses = {}
    programs_by_type = {}
    programs_by_type_slug = {}
    organizations = {}
    for site in Site.objects.all():
        site_config = getattr(site, 'configuration', None)
        if site_config is None or not site_config.get_value('COURSE_CATALOG_API_URL'):
            logger.info(f'Skipping site {site.domain}. No configuration.')
            cache.set(SITE_PROGRAM_UUIDS_CACHE_KEY_TPL.format(domain=site.domain), [], None)
            cache.set(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=site.domain), [], None)
            continue

        client = create_catalog_api_client(user, site=site)

        uuids, program_uuids_failed = self.get_site_program_uuids(client, site)
        new_programs, program_details_failed = self.fetch_program_details(client, uuids)
        new_pathways, pathways_failed = self.get_pathways(client, site)
        new_pathways, new_programs, pathway_processing_failed = self.process_pathways(
            site, new_pathways, new_programs)

        # BUG FIX: plain `failure = any(...)` would reset failures recorded
        # for earlier sites; accumulate instead so one bad site still fails
        # the run.
        failure = failure or any([
            program_uuids_failed,
            program_details_failed,
            pathways_failed,
            pathway_processing_failed,
        ])

        programs.update(new_programs)
        pathways.update(new_pathways)
        courses.update(self.get_courses(new_programs))
        catalog_courses.update(self.get_catalog_courses(new_programs))
        programs_by_type.update(self.get_programs_by_type(site, new_programs))
        programs_by_type_slug.update(self.get_programs_by_type_slug(site, new_programs))
        organizations.update(self.get_programs_by_organization(new_programs))

        logger.info(
            'Caching UUIDs for {total} programs for site {site_name}.'.format(
                total=len(uuids),
                site_name=site.domain,
            ))
        cache.set(SITE_PROGRAM_UUIDS_CACHE_KEY_TPL.format(domain=site.domain), uuids, None)

        pathway_ids = list(new_pathways.keys())
        logger.info(
            'Caching ids for {total} pathways for site {site_name}.'.format(
                total=len(pathway_ids),
                site_name=site.domain,
            ))
        cache.set(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=site.domain), pathway_ids, None)

    logger.info(f'Caching details for {len(programs)} programs.')
    cache.set_many(programs, None)

    logger.info(f'Caching details for {len(pathways)} pathways.')
    cache.set_many(pathways, None)

    logger.info(f'Caching programs uuids for {len(courses)} courses.')
    cache.set_many(courses, None)

    logger.info(f'Caching programs uuids for {len(catalog_courses)} catalog courses.')
    cache.set_many(catalog_courses, None)

    # f-strings are already str; the redundant str() wrappers were removed.
    logger.info(f'Caching program UUIDs by {len(programs_by_type)} program types.')
    cache.set_many(programs_by_type, None)

    logger.info(f'Caching program UUIDs by {len(programs_by_type_slug)} program type slugs.')
    cache.set_many(programs_by_type_slug, None)

    logger.info(f'Caching programs uuids for {len(organizations)} organizations')
    cache.set_many(organizations, None)

    if failure:
        # Non-zero exit fails the CI job running this command, alerting
        # site operators that something went wrong.
        sys.exit(1)
def test_get_many(self, mock_warning, mock_info):
    """
    Verify get_pathways() behavior when the cache holds only some of the
    pathway details: cached entries are returned, missing ones are retried
    and then logged.
    """
    pathways = PathwayFactory.create_batch(3)

    # Cache details for 2 of 3 programs.
    partial_pathways = {
        PATHWAY_CACHE_KEY_TPL.format(id=pathway['id']): pathway
        for pathway in pathways[:2]
    }
    cache.set_many(partial_pathways, None)

    # When called before pathways are cached, the function should return an
    # empty list and log a warning.
    self.assertEqual(get_pathways(self.site), [])
    mock_warning.assert_called_once_with('Failed to get credit pathway ids from the cache.')
    mock_warning.reset_mock()

    # Cache all 3 pathways
    cache.set(
        SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=self.site.domain),
        [pathway['id'] for pathway in pathways],
        None
    )

    actual_pathways = get_pathways(self.site)

    # The 2 cached pathways should be returned while info and warning
    # messages should be logged for the missing one.
    self.assertEqual(
        set(pathway['id'] for pathway in actual_pathways),
        set(pathway['id'] for pathway in partial_pathways.values())
    )
    mock_info.assert_called_with('Failed to get details for 1 pathways. Retrying.')
    mock_warning.assert_called_with(
        'Failed to get details for credit pathway {id} from the cache.'.format(id=pathways[2]['id'])
    )
    mock_warning.reset_mock()

    # We can't use a set comparison here because these values are dictionaries
    # and aren't hashable. We've already verified that all pathways came out
    # of the cache above, so all we need to do here is verify the accuracy of
    # the data itself.
    for pathway in actual_pathways:
        key = PATHWAY_CACHE_KEY_TPL.format(id=pathway['id'])
        self.assertEqual(pathway, partial_pathways[key])

    # Cache details for all 3 pathways.
    all_pathways = {
        PATHWAY_CACHE_KEY_TPL.format(id=pathway['id']): pathway
        for pathway in pathways
    }
    cache.set_many(all_pathways, None)

    actual_pathways = get_pathways(self.site)

    # All 3 pathways should be returned.
    self.assertEqual(
        set(pathway['id'] for pathway in actual_pathways),
        set(pathway['id'] for pathway in all_pathways.values())
    )
    self.assertFalse(mock_warning.called)

    # Verify the cached data itself, entry by entry (dicts aren't hashable).
    for pathway in actual_pathways:
        key = PATHWAY_CACHE_KEY_TPL.format(id=pathway['id'])
        self.assertEqual(pathway, all_pathways[key])
def handle(self, *args, **options):
    """
    Fetch program, pathway, and course data for every configured site and
    store it in the cache.

    Sites without a COURSE_CATALOG_API_URL configured are skipped (their
    cached id lists are reset to empty). If any site's data could not be
    fully retrieved, exits with status 1 so a CI job running this command
    fails visibly.
    """
    failure = False
    logger.info('populate-multitenant-programs switch is ON')

    catalog_integration = CatalogIntegration.current()
    username = catalog_integration.service_username

    try:
        user = User.objects.get(username=username)
    except User.DoesNotExist:
        logger.exception(
            u'Failed to create API client. Service user {username} does not exist.'
            .format(username=username))
        raise

    programs = {}
    pathways = {}
    courses = {}
    for site in Site.objects.all():
        site_config = getattr(site, 'configuration', None)
        if site_config is None or not site_config.get_value('COURSE_CATALOG_API_URL'):
            logger.info(
                u'Skipping site {domain}. No configuration.'.format(domain=site.domain))
            cache.set(SITE_PROGRAM_UUIDS_CACHE_KEY_TPL.format(domain=site.domain), [], None)
            cache.set(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=site.domain), [], None)
            continue

        client = create_catalog_api_client(user, site=site)

        uuids, program_uuids_failed = self.get_site_program_uuids(client, site)
        new_programs, program_details_failed = self.fetch_program_details(client, uuids)
        new_pathways, pathways_failed = self.get_pathways(client, site)
        new_pathways, new_programs, pathway_processing_failed = self.process_pathways(
            site, new_pathways, new_programs)
        new_courses, courses_failed = self.get_courses(new_programs)

        # BUG FIX: plain `failure = any(...)` would reset failures recorded
        # for earlier sites; accumulate instead so one bad site still fails
        # the run.
        failure = failure or any([
            program_uuids_failed,
            program_details_failed,
            pathways_failed,
            pathway_processing_failed,
            courses_failed,
        ])

        programs.update(new_programs)
        pathways.update(new_pathways)
        courses.update(new_courses)

        logger.info(
            u'Caching UUIDs for {total} programs for site {site_name}.'.format(
                total=len(uuids),
                site_name=site.domain,
            ))
        cache.set(SITE_PROGRAM_UUIDS_CACHE_KEY_TPL.format(domain=site.domain), uuids, None)

        pathway_ids = list(new_pathways.keys())
        logger.info(
            u'Caching ids for {total} pathways for site {site_name}.'.format(
                total=len(pathway_ids),
                site_name=site.domain,
            ))
        cache.set(SITE_PATHWAY_IDS_CACHE_KEY_TPL.format(domain=site.domain), pathway_ids, None)

    successful_programs = len(programs)
    logger.info(
        u'Caching details for {successful_programs} programs.'.format(
            successful_programs=successful_programs))
    cache.set_many(programs, None)

    successful_pathways = len(pathways)
    logger.info(
        u'Caching details for {successful_pathways} pathways.'.format(
            successful_pathways=successful_pathways))
    cache.set_many(pathways, None)

    successful_courses = len(courses)
    logger.info(
        u'Caching programs uuids for {successful_courses} courses.'.format(
            successful_courses=successful_courses))
    cache.set_many(courses, None)

    if failure:
        # This will fail a Jenkins job running this command, letting site
        # operators know that there was a problem.
        sys.exit(1)