def test_search_from_url(self):
    """ ensure that we get the error back when the backend fails """
    searcher = SearchEngine.get_search_engine(TEST_INDEX_NAME)
    searcher.index(
        "courseware_content",
        [{"id": "FAKE_ID_1", "content": {"text": "Little Darling, it's been a long long lonely winter"}}]
    )
    searcher.index(
        "courseware_content",
        [{"id": "FAKE_ID_2", "content": {"text": "Little Darling, it's been a year since sun been gone"}}]
    )
    searcher.index("test_doc", [{"id": "FAKE_ID_3", "content": {"text": "Here comes the sun"}}])

    code, results = post_request({"search_string": "sun"})
    self.assertGreater(code, 499)
    self.assertEqual(results["error"], 'An error occurred when searching for "sun"')

    with self.assertRaises(StandardError):
        searcher.search(query_string="test search")
def _display_name_filter(self, library, display_name):
    """ Filters library children by display name """
    search_engine = SearchEngine.get_search_engine(index="library_index")
    if search_engine:
        log.warning("search engine found")
        filter_clause = {
            "library": unicode(normalize_key_for_search(library.location.library_key)),
        }
        search_result = search_engine.search(field_dictionary=filter_clause)
        results = []
        for result in search_result.get('results', []):
            fields = self.deep_search(["display_name"], result)
            if fields['display_name'] == display_name:
                results.append(result)
        return [LibraryUsageLocator.from_string(item['data']['id']) for item in results]
    else:
        log.warning("search engine NOT found")
        results = []
        for child_key in library.children:
            item = self.store.get_item(child_key, 1)
            # read every field into a plain dictionary so we can compare display_name
            fields = {field.name: field.read_json(item) for field in item.fields.values()}
            if fields.get('display_name') == display_name:
                results.append(child_key)
        return results
def test_abstract_impl(self):
    """ Make sure that if one tries to use the abstract base, then operations yield NotImplementedError """
    abstract = SearchEngine("test_index_name")
    test_string = "A test string"
    with self.assertRaises(NotImplementedError):
        abstract.index([{"name": test_string}])
    with self.assertRaises(NotImplementedError):
        abstract.search(test_string)
    with self.assertRaises(NotImplementedError):
        abstract.remove(["test_id"])
def remove_deleted_items(cls, structure_key):
    """ Remove item from the Course About search index """
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    if not searcher:
        return
    response = searcher.search(field_dictionary=cls._get_location_info(structure_key))
    result_ids = [result["data"]["id"] for result in response["results"]]
    searcher.remove(result_ids)
def engine(cls):
    """ Return course team search engine (if feature is enabled). """
    try:
        return SearchEngine.get_search_engine(index=cls.INDEX_NAME)
    except ConnectionError as err:
        logging.error('Error connecting to elasticsearch: %s', err)
        raise ElasticSearchConnectionError  # lint-amnesty, pylint: disable=raise-missing-from
def engine(cls):
    """ Return course team search engine (if feature is enabled). """
    try:
        return SearchEngine.get_search_engine(index=cls.INDEX_NAME)
    except ConnectionError as err:
        logging.error('Error connecting to elasticsearch: %s', err)
        raise ElasticSearchConnectionError
def engine(cls):
    """ Return course team search engine (if feature is enabled). """
    try:
        return SearchEngine.get_search_engine(index=cls.INDEX_NAME)
    except ConnectionError as err:
        logging.error(u'Error connecting to elasticsearch: %s', err)
        raise ElasticSearchConnectionError
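# A minimal usage sketch (not from the source) showing how a caller might
# consume the `engine()` classmethod above. `CourseTeamIndexer` and
# `ElasticSearchConnectionError` are names taken from the surrounding snippets;
# `search_teams` itself is hypothetical.
def search_teams(query_string):
    """ Return raw search hits for course teams, or None when search is unavailable. """
    try:
        searcher = CourseTeamIndexer.engine()
    except ElasticSearchConnectionError:
        # engine() already logged the connection failure; degrade gracefully
        # instead of failing the whole request
        return None
    return searcher.search(query_string=query_string)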
def test_search_from_url(self):
    """ ensure that we get the error back when the backend fails """
    searcher = SearchEngine.get_search_engine(TEST_INDEX_NAME)
    with self.assertRaises(StandardError):
        searcher.index("courseware_content", [{"id": "FAKE_ID_3", "content": {"text": "Here comes the sun"}}])
def remove_all_libraries(cls):
    """ Remove all libraries from the index """
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    response = searcher.search(doc_type=cls.LIBRARY_DOCUMENT_TYPE, filter_dictionary={}, size=MAX_SIZE)
    ids = [result["data"]["id"] for result in response["results"]]
    searcher.remove(cls.LIBRARY_DOCUMENT_TYPE, ids)
def _perform_elastic_search(cls, filter_terms, text_search):
    """ Build a query and search directly on elasticsearch """
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    return _translate_hits(searcher._es.search(  # pylint: disable=protected-access
        index=searcher.index_name,
        body=cls.build_elastic_query(filter_terms, text_search),
        size=MAX_SIZE
    ))
def remove_all_items(cls):
    """ Remove all items from the index """
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    response = searcher.search(doc_type=cls.DOCUMENT_TYPE, filter_dictionary={}, size=MAX_SIZE)
    ids = [result["data"]["id"] for result in response["results"]]
    searcher.remove(cls.DOCUMENT_TYPE, ids, **cls.SEARCH_KWARGS)
def handle(self, *args, **options):
    """
    By convention set by Django developers, this method actually executes command's actions.
    So, there could be no better docstring than emphasize this once again.
    """
    all_option = options.get('all', False)
    setup_option = options.get('setup', False)
    index_all_courses_option = all_option or setup_option

    if len(args) == 0 and not index_all_courses_option:
        raise CommandError(u"reindex_course requires one or more arguments: <course_id>")

    store = modulestore()

    if index_all_courses_option:
        index_name = CoursewareSearchIndexer.INDEX_NAME
        doc_type = CoursewareSearchIndexer.DOCUMENT_TYPE
        if setup_option:
            try:
                # try getting the ElasticSearch engine
                searcher = SearchEngine.get_search_engine(index_name)
            except exceptions.ElasticsearchException as exc:
                logging.exception('Search Engine error - %s', unicode(exc))
                return

            index_exists = searcher._es.indices.exists(index=index_name)  # pylint: disable=protected-access
            doc_type_exists = searcher._es.indices.exists_type(  # pylint: disable=protected-access
                index=index_name,
                doc_type=doc_type
            )
            index_mapping = searcher._es.indices.get_mapping(  # pylint: disable=protected-access
                index=index_name,
                doc_type=doc_type
            ) if index_exists and doc_type_exists else {}

            if index_exists and index_mapping:
                return

        # if reindexing is done during devstack setup step, don't prompt the user
        if setup_option or query_yes_no(self.CONFIRMATION_PROMPT, default="no"):
            # in case of --setup or --all, get the list of course keys from all courses
            # that are stored in the modulestore
            course_keys = [course.id for course in modulestore().get_courses()]
        else:
            return
    else:
        # in case course keys are provided as arguments
        course_keys = map(self._parse_course_key, args)

    for course_key in course_keys:
        CoursewareSearchIndexer.do_course_reindex(store, course_key)
def delete_course_task(user_id, course_key_string):
    profile = UserProfile.objects.get(pk=user_id)
    user = User.objects.get(pk=profile.user_id)
    course_key = CourseKey.from_string(course_key_string)
    delete_course_and_groups(course_key, user.id)
    searcher = SearchEngine.get_search_engine(CoursewareSearchIndexer.INDEX_NAME)
    if searcher is not None:
        CoursewareSearchIndexer.remove_deleted_items(searcher, course_key, [])
        searcher.remove(CourseAboutSearchIndexer.DISCOVERY_DOCUMENT_TYPE, [course_key_string])
def handle(self, *args, **options):
    """
    By convention set by Django developers, this method actually executes command's actions.
    So, there could be no better docstring than emphasize this once again.
    """
    course_ids = options['course_ids']
    all_option = options['all']
    setup_option = options['setup']
    index_all_courses_option = all_option or setup_option

    if (not len(course_ids) and not index_all_courses_option) or \
            (len(course_ids) and index_all_courses_option):
        raise CommandError("reindex_course requires one or more <course_id>s OR the --all or --setup flags.")

    store = modulestore()

    if index_all_courses_option:
        index_name = CoursewareSearchIndexer.INDEX_NAME
        doc_type = CoursewareSearchIndexer.DOCUMENT_TYPE
        if setup_option:
            try:
                # try getting the ElasticSearch engine
                searcher = SearchEngine.get_search_engine(index_name)
            except exceptions.ElasticsearchException as exc:
                logging.exception(u'Search Engine error - %s', exc)
                return

            index_exists = searcher._es.indices.exists(index=index_name)  # pylint: disable=protected-access
            doc_type_exists = searcher._es.indices.exists_type(  # pylint: disable=protected-access
                index=index_name,
                doc_type=doc_type
            )
            index_mapping = searcher._es.indices.get_mapping(  # pylint: disable=protected-access
                index=index_name,
                doc_type=doc_type
            ) if index_exists and doc_type_exists else {}

            if index_exists and index_mapping:
                return

        # if reindexing is done during devstack setup step, don't prompt the user
        if setup_option or query_yes_no(self.CONFIRMATION_PROMPT, default="no"):
            # in case of --setup or --all, get the list of course keys from all courses
            # that are stored in the modulestore
            course_keys = [course.id for course in modulestore().get_courses()]
        else:
            return
    else:
        # in case course keys are provided as arguments
        course_keys = map(self._parse_course_key, course_ids)

    for course_key in course_keys:
        CoursewareSearchIndexer.do_course_reindex(store, course_key)
def test_task_indexing_course(self):
    """ Making sure that the receiver correctly fires off the task when invoked by signal """
    searcher = SearchEngine.get_search_engine(CoursewareSearchIndexer.INDEX_NAME)
    response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
    self.assertEqual(response["total"], 0)

    listen_for_course_publish(self, self.course.id)

    # Note that this test will only succeed if celery is working in inline mode
    response = searcher.search(field_dictionary={"course": unicode(self.course.id)})
    self.assertEqual(response["total"], 3)
def setUp(self):
    """ Set up tests. """
    super(ReindexCourseTeamTest, self).setUp()
    self.team1 = CourseTeamFactory(course_id=COURSE_KEY1, team_id='team1')
    self.team2 = CourseTeamFactory(course_id=COURSE_KEY1, team_id='team2')
    self.team3 = CourseTeamFactory(course_id=COURSE_KEY1, team_id='team3')
    self.search_engine = SearchEngine.get_search_engine(index='index_course_team')
def index_course_programs(course_id):
    """ Reindex only the programs that contain the given course """
    INDEX_NAME = "courseware_index"
    searcher = SearchEngine.get_search_engine(INDEX_NAME)
    if not searcher:
        return
    course_key = CourseKey.from_string(course_id)
    programs = Program.objects.filter(courses__course_key=course_key)
    for program in programs:
        program_banner_image_url = (
            program.banner_image.url if program.banner_image
            else '/static/images/banner-black-background.jpg'
        )
        if program.start <= datetime.now(pytz.UTC).date():
            program_info = {
                'id': program.id,
                'course': program.id,
                'content': {
                    'display_name': program.name,
                    'overview': program.short_description
                },
                'image_url': program_banner_image_url,
                'start': program.start,
                'language': program.language.code,
                'subject': program.subject.name,
                'is_program': True,
            }
            # Broad exception handler to protect around and report problems with indexing
            try:
                searcher.index('course_info', [program_info])
            except:  # pylint: disable=bare-except
                log.exception(
                    "Program discovery indexing error encountered %s",
                    program_info.get('id', ''),
                )
                return JsonResponse({
                    'success': False,
                    'error': 'Program discovery indexing error encountered',
                    'Program': program.name
                })
        else:
            return JsonResponse({
                'success': False,
                'error': 'The start date of the ' + program.name + ' program is in the future: ' + str(program.start),
                'suggestion': 'Please set a program start date in the past',
                'Program': program.name
            })
    return True
def test_task_library_update(self):
    """ Making sure that the receiver correctly fires off the task when invoked by signal """
    searcher = SearchEngine.get_search_engine(LibrarySearchIndexer.INDEX_NAME)
    library_search_key = unicode(normalize_key_for_search(self.library.location.library_key))
    response = searcher.search(field_dictionary={"library": library_search_key})
    self.assertEqual(response["total"], 0)

    listen_for_library_update(self, self.library.location.library_key)

    # Note that this test will only succeed if celery is working in inline mode
    response = searcher.search(field_dictionary={"library": library_search_key})
    self.assertEqual(response["total"], 2)
def test_abstract_impl(self):
    """ Make sure that if one tries to use the abstract base, then operations yield NotImplementedError """
    abstract = SearchEngine("test_index_name")
    test_string = "A test string"
    with self.assertRaises(NotImplementedError):
        abstract.index("test_doc", [{"name": test_string}])
    with self.assertRaises(NotImplementedError):
        abstract.search(test_string)
    with self.assertRaises(NotImplementedError):
        abstract.remove("test_doc", ["test_id"])
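# A minimal sketch (not from the source) of the contract the two abstract-base
# tests above exercise: a concrete engine must override index/search/remove,
# otherwise SearchEngine raises NotImplementedError. The in-memory storage and
# the exact base-class constructor signature are assumptions.
class InMemorySearchEngine(SearchEngine):
    """ Toy engine following the (doc_type, ...) call style of the second test. """

    def __init__(self, index=None):
        super(InMemorySearchEngine, self).__init__(index)
        self._documents = {}

    def index(self, doc_type, sources, **kwargs):
        # store each source dictionary keyed by its id
        for source in sources:
            self._documents[source["id"]] = source

    def search(self, query_string=None, field_dictionary=None, filter_dictionary=None, **kwargs):
        # no real matching here - return everything, in the response shape
        # ({"total": ..., "results": [{"data": ...}]}) the callers above expect
        results = [{"data": doc} for doc in self._documents.values()]
        return {"total": len(results), "results": results}

    def remove(self, doc_type, doc_ids, **kwargs):
        for doc_id in doc_ids:
            self._documents.pop(doc_id, None)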
def remove_deleted_items(cls, structure_key):
    """ Remove item from the Course About search index """
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    if not searcher:
        return
    response = searcher.search(
        doc_type=cls.DISCOVERY_DOCUMENT_TYPE,
        field_dictionary=cls._get_location_info(structure_key)
    )
    result_ids = [result["data"]["id"] for result in response["results"]]
    searcher.remove(cls.DISCOVERY_DOCUMENT_TYPE, result_ids)
def setUp(self):
    """ Set up tests. """
    super(ReindexCourseTeamTest, self).setUp()  # lint-amnesty, pylint: disable=super-with-arguments
    self.team1 = CourseTeamFactory(course_id=COURSE_KEY1, team_id='team1')
    self.team2 = CourseTeamFactory(course_id=COURSE_KEY1, team_id='team2')
    self.team3 = CourseTeamFactory(course_id=COURSE_KEY1, team_id='team3')
    self.search_engine = SearchEngine.get_search_engine(index='index_course_team')
def index_programs_information(request):
    """ Add all the programs to the course discovery index """
    INDEX_NAME = "courseware_index"
    searcher = SearchEngine.get_search_engine(INDEX_NAME)
    if not searcher:
        return
    programs = Program.objects.all()
    for program in programs:
        if program.start <= datetime.now(pytz.UTC).date():
            program_info = {
                'id': program.id,
                'course': program.id,
                'content': {
                    'display_name': program.name,
                    'overview': program.short_description
                },
                'image_url': program.banner_image.url,
                'start': program.start,
                'language': program.language.code,
                'subject': program.subject.name,
                'is_program': True,
            }
            # Broad exception handler to protect around and report problems with indexing
            try:
                searcher.index('course_info', [program_info])
            except:  # pylint: disable=bare-except
                log.exception(
                    "Program discovery indexing error encountered %s",
                    program_info.get('id', ''),
                )
                return JsonResponse({
                    'success': False,
                    'error': 'Program discovery indexing error encountered',
                    'Program': program.name
                })
        else:
            return JsonResponse({
                'success': False,
                'error': 'The start date of the ' + program.name + ' program is in the future: ' + str(program.start),
                'suggestion': 'Please set a program start date in the past',
                'Program': program.name
            })
    return redirect(reverse('show-programs'))
def _problem_type_filter(self, library, capa_type):
    """ Filters library children by capa type """
    search_engine = SearchEngine.get_search_engine(index="library_index")
    if search_engine:
        filter_clause = {
            "library": six.text_type(normalize_key_for_search(library.location.library_key)),
            "content_type": ProblemBlock.INDEX_CONTENT_TYPE,
            "problem_types": capa_type
        }
        search_result = search_engine.search(field_dictionary=filter_clause)
        results = search_result.get('results', [])
        return [LibraryUsageLocator.from_string(item['data']['id']) for item in results]
    else:
        return [key for key in library.children if self._filter_child(key, capa_type)]
def handle(self, *args, **options):
    """
    By convention set by Django developers, this method actually executes command's actions.
    So, there could be no better docstring than emphasize this once again.
    """
    course_ids = options['course_ids']
    all_option = options['all']
    setup_option = options['setup']
    index_all_courses_option = all_option or setup_option

    if (not len(course_ids) and not index_all_courses_option) or (len(course_ids) and index_all_courses_option):  # lint-amnesty, pylint: disable=len-as-condition
        raise CommandError("reindex_course requires one or more <course_id>s OR the --all or --setup flags.")

    store = modulestore()

    if index_all_courses_option:
        index_names = (CoursewareSearchIndexer.INDEX_NAME, CourseAboutSearchIndexer.INDEX_NAME)
        if setup_option:
            for index_name in index_names:
                try:
                    searcher = SearchEngine.get_search_engine(index_name)
                except exceptions.ElasticsearchException as exc:
                    logging.exception('Search Engine error - %s', exc)
                    return

                index_exists = searcher._es.indices.exists(index=index_name)  # pylint: disable=protected-access
                index_mapping = searcher._es.indices.get_mapping(  # pylint: disable=protected-access
                    index=index_name,
                ) if index_exists else {}

                if index_exists and index_mapping:
                    return

        # if reindexing is done during devstack setup step, don't prompt the user
        if setup_option or query_yes_no(self.CONFIRMATION_PROMPT, default="no"):
            # in case of --setup or --all, get the list of course keys from all courses
            # that are stored in the modulestore
            course_keys = [course.id for course in modulestore().get_courses()]
        else:
            return
    else:
        # in case course keys are provided as arguments
        course_keys = list(map(self._parse_course_key, course_ids))

    for course_key in course_keys:
        try:
            CoursewareSearchIndexer.do_course_reindex(store, course_key)
        except Exception as exc:  # lint-amnesty, pylint: disable=broad-except
            logging.exception('Error indexing course %s due to the error: %s', course_key, exc)
def _problem_type_filter(self, library, capa_type):
    """ Filters library children by capa type """
    search_engine = SearchEngine.get_search_engine(index="library_index")
    if search_engine:
        filter_clause = {
            "library": unicode(normalize_key_for_search(library.location.library_key)),
            "content_type": CapaDescriptor.INDEX_CONTENT_TYPE,
            "problem_types": capa_type
        }
        search_result = search_engine.search(field_dictionary=filter_clause)
        results = search_result.get('results', [])
        return [LibraryUsageLocator.from_string(item['data']['id']) for item in results]
    else:
        return [key for key in library.children if self._filter_child(key, capa_type)]
def reindex_specific_program(request, pk):
    """ Reindex a specific program """
    INDEX_NAME = "courseware_index"
    searcher = SearchEngine.get_search_engine(INDEX_NAME)
    if not searcher:
        return
    try:
        program = Program.objects.get(pk=pk)
    except Program.DoesNotExist:
        return JsonResponse({
            'success': False,
            'error': 'Program not found for id ' + str(pk),
        })
def delete_temp_user_task(request, user_id):
    profile = UserProfile.objects.get(pk=user_id)
    user = User.objects.get(pk=profile.user_id)
    courses = [format_course_for_view(c) for c in get_courses_accessible_to_user(request, user)[0]]
    libraries = [format_library_for_view(lib, user) for lib in accessible_libraries_list(user)]
    for course in courses:
        course_key = CourseKey.from_string(course["course_key"])
        delete_course_and_groups(course_key, user.id)
        searcher = SearchEngine.get_search_engine(CoursewareSearchIndexer.INDEX_NAME)
        if searcher is not None:
            CoursewareSearchIndexer.remove_deleted_items(searcher, course_key, [])
            searcher.remove(CourseAboutSearchIndexer.DISCOVERY_DOCUMENT_TYPE, [course["course_key"]])
    for library in libraries:
        library_key = CourseKey.from_string(library['library_key'])
        delete_course_and_groups(library_key, user.id)
def index_libraries(cls, library_keys):
    """
    Index the specified libraries. If they already exist, replace them with new ones.
    """
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    library_dicts = []

    for library_key in library_keys:
        ref = ContentLibrary.objects.get_by_key(library_key)
        lib_bundle = LibraryBundle(library_key, ref.bundle_uuid, draft_name=DRAFT_NAME)
        num_blocks = len(lib_bundle.get_top_level_usages())
        last_published = lib_bundle.get_last_published_time()
        last_published_str = None
        if last_published:
            last_published_str = last_published.strftime('%Y-%m-%dT%H:%M:%SZ')
        (has_unpublished_changes, has_unpublished_deletes) = lib_bundle.has_changes()

        bundle_metadata = get_bundle(ref.bundle_uuid)

        # NOTE: Increment ContentLibraryIndexer.SCHEMA_VERSION if the following schema is updated to avoid dealing
        # with outdated indexes which might cause errors due to missing/invalid attributes.
        library_dict = {
            "schema_version": ContentLibraryIndexer.SCHEMA_VERSION,
            "id": str(library_key),
            "uuid": str(bundle_metadata.uuid),
            "title": bundle_metadata.title,
            "description": bundle_metadata.description,
            "num_blocks": num_blocks,
            "version": bundle_metadata.latest_version,
            "last_published": last_published_str,
            "has_unpublished_changes": has_unpublished_changes,
            "has_unpublished_deletes": has_unpublished_deletes,
        }
        library_dicts.append(library_dict)

    return searcher.index(cls.LIBRARY_DOCUMENT_TYPE, library_dicts)
def get_items(cls, ids=None, filter_terms=None, text_search=None):
    """
    Retrieve a list of items from the index.

    Arguments:
        ids - List of ids to be searched for in the index
        filter_terms - Dictionary of filters to be applied
        text_search - String which is used to do a text search in the supported indexes.
    """
    if filter_terms is None:
        filter_terms = {}
    if ids is not None:
        filter_terms = {
            "id": [str(item) for item in ids],
            "schema_version": [cls.SCHEMA_VERSION],
            **filter_terms,
        }
    if text_search:
        response = cls._perform_elastic_search(filter_terms, text_search)
    else:
        searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
        response = searcher.search(field_dictionary=filter_terms, size=MAX_SIZE)

    response = [result["data"] for result in response["results"]]
    return sorted(response, key=lambda i: i["id"])
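# A minimal usage sketch (not from the source) of `get_items` above.
# `ContentLibraryIndexer` is the indexer class used in the surrounding
# snippets; the library key and field values are hypothetical.
all_libraries = ContentLibraryIndexer.get_items()
one_library = ContentLibraryIndexer.get_items(ids=["lib:DemoX:demo-lib"])
changed_physics_libraries = ContentLibraryIndexer.get_items(
    filter_terms={"has_unpublished_changes": [True]},  # field from the indexed schema
    text_search="physics",  # routed through _perform_elastic_search
)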
def add_to_search_index(modulestore, location, delete=False, raise_on_error=False):
    """ Add to courseware search index from given location and its children """
    error_list = []
    # TODO - inline for now, need to move this out to a celery task
    searcher = SearchEngine.get_search_engine(INDEX_NAME)
    if not searcher:
        return

    if isinstance(location, CourseLocator):
        course_key = location
    else:
        course_key = location.course_key

    location_info = {
        "course": unicode(course_key),
    }

    def _fetch_item(item_location):
        """ Fetch the item from the modulestore location, log if not found, but continue """
        try:
            if isinstance(item_location, CourseLocator):
                item = modulestore.get_course(item_location)
            else:
                item = modulestore.get_item(item_location, revision=ModuleStoreEnum.RevisionOption.published_only)
        except ItemNotFoundError:
            log.warning('Cannot find: %s', item_location)
            return None
        return item

    def index_item_location(item_location, current_start_date):
        """ add this item to the search index """
        item = _fetch_item(item_location)
        if not item:
            return

        is_indexable = hasattr(item, "index_dictionary")
        # if it's not indexable and it does not have children, then ignore
        if not is_indexable and not item.has_children:
            return

        # if it has a defined start, then apply it to its children
        if item.start and (not current_start_date or item.start > current_start_date):
            current_start_date = item.start

        if item.has_children:
            for child_loc in item.children:
                index_item_location(child_loc, current_start_date)

        item_index = {}
        item_index_dictionary = item.index_dictionary() if is_indexable else None

        # if it has something to add to the index, then add it
        if item_index_dictionary:
            try:
                item_index.update(location_info)
                item_index.update(item_index_dictionary)
                item_index['id'] = unicode(item.scope_ids.usage_id)
                if current_start_date:
                    item_index['start_date'] = current_start_date

                searcher.index(DOCUMENT_TYPE, item_index)
            except Exception as err:  # pylint: disable=broad-except
                # broad exception so that index operation does not fail on one item of many
                log.warning('Could not index item: %s - %s', item_location, unicode(err))
                error_list.append(_('Could not index item: {}').format(item_location))

    def remove_index_item_location(item_location):
        """ remove this item from the search index """
        item = _fetch_item(item_location)
        if item:
            if item.has_children:
                for child_loc in item.children:
                    remove_index_item_location(child_loc)

            searcher.remove(DOCUMENT_TYPE, unicode(item.scope_ids.usage_id))

    try:
        if delete:
            remove_index_item_location(location)
        else:
            index_item_location(location, None)
    except Exception as err:  # pylint: disable=broad-except
        # broad exception so that index operation does not prevent the rest of the application from working
        log.exception(
            "Indexing error encountered, courseware index may be out of date %s - %s",
            course_key,
            unicode(err)
        )
        error_list.append(_('General indexing error occurred'))

    if raise_on_error and error_list:
        raise SearchIndexingError(_('Error(s) present during indexing'), error_list)
def searcher(self):
    """ cached instance of search engine """
    if self._searcher is None:
        self._searcher = SearchEngine.get_search_engine(TEST_INDEX_NAME)
    return self._searcher
def index_about_information(cls, modulestore, course):
    """
    Add the given course to the course discovery index

    Arguments:
        modulestore - modulestore object to use for operations
        course - course object from which to take properties, locate about information
    """
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    if not searcher:
        return

    course_id = unicode(course.id)
    course_info = {
        'id': course_id,
        'course': course_id,
        'content': {},
        'image_url': course_image_url(course),
    }

    # load data for all of the 'about' modules for this course into a dictionary
    about_dictionary = {
        item.location.name: item.data
        for item in modulestore.get_items(course.id, qualifiers={"category": "about"})
    }

    about_context = {
        "course": course,
        "about_dictionary": about_dictionary,
    }

    for about_information in cls.ABOUT_INFORMATION_TO_INCLUDE:
        # Broad exception handler so that a single bad property does not scupper the collection of others
        try:
            section_content = about_information.get_value(**about_context)
        except:  # pylint: disable=bare-except
            section_content = None
            log.warning(
                "Course discovery could not collect property %s for course %s",
                about_information.property_name,
                course_id,
                exc_info=True,
            )

        if section_content:
            if about_information.index_flags & AboutInfo.ANALYSE:
                analyse_content = section_content
                if isinstance(section_content, basestring):
                    analyse_content = strip_html_content_to_text(section_content)
                course_info['content'][about_information.property_name] = analyse_content
            if about_information.index_flags & AboutInfo.PROPERTY:
                course_info[about_information.property_name] = section_content

    # Broad exception handler to protect around and report problems with indexing
    try:
        searcher.index(cls.DISCOVERY_DOCUMENT_TYPE, [course_info])
    except:  # pylint: disable=bare-except
        log.exception(
            "Course discovery indexing error encountered, course discovery index may be out of date %s",
            course_id,
        )
        raise

    log.debug("Successfully added %s course to the course discovery index", course_id)
def index(cls, modulestore, structure_key, triggered_at=None, reindex_age=REINDEX_AGE):
    """
    Process course for indexing

    Arguments:
        modulestore - modulestore object to use for operations
        structure_key (CourseKey|LibraryKey) - course or library identifier
        triggered_at (datetime) - provides time at which indexing was triggered;
            useful for index updates - only things changed recently from that date
            (within REINDEX_AGE above ^^) will have their index updated, others skip
            updating their index but are still walked through in order to identify
            which items may need to be removed from the index
            If None, then a full reindex takes place

    Returns:
        Number of items that have been added to the index
    """
    error_list = []
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    if not searcher:
        return

    structure_key = cls.normalize_structure_key(structure_key)
    location_info = cls._get_location_info(structure_key)

    # Wrap counter in dictionary - otherwise we seem to lose scope inside the embedded function `prepare_item_index`
    indexed_count = {"count": 0}

    # indexed_items is a list of all the items that we wish to remain in the
    # index, whether or not we are planning to actually update their index.
    # This is used in order to build a query to remove those items not in this
    # list - those are ready to be destroyed
    indexed_items = set()
    # items_index is a list of all the items index dictionaries.
    # it is used to collect all indexes and index them using bulk API,
    # instead of per item index API call.
    items_index = []

    def get_item_location(item):
        """ Gets the version agnostic item location """
        return item.location.version_agnostic().replace(branch=None)

    def prepare_item_index(item, skip_index=False, groups_usage_info=None):
        """
        Add this item to the items_index and indexed_items list

        Arguments:
            item - item to add to index, its children will be processed recursively
            skip_index - simply walk the children in the tree, the content change is
                older than the REINDEX_AGE window and would have been already indexed.
                This should really only be passed from the recursive child calls when
                this method has determined that it is safe to do so

        Returns:
            item_content_groups - content groups assigned to indexed item
        """
        is_indexable = hasattr(item, "index_dictionary")
        item_index_dictionary = item.index_dictionary() if is_indexable else None
        # if it's not indexable and it does not have children, then ignore
        if not item_index_dictionary and not item.has_children:
            return

        item_content_groups = None

        if item.category == "split_test":
            split_partition = item.get_selected_partition()
            for split_test_child in item.get_children():
                if split_partition:
                    for group in split_partition.groups:
                        group_id = unicode(group.id)
                        child_location = item.group_id_to_child.get(group_id, None)
                        if child_location == split_test_child.location:
                            groups_usage_info.update({
                                unicode(get_item_location(split_test_child)): [group_id],
                            })
                            for component in split_test_child.get_children():
                                groups_usage_info.update({
                                    unicode(get_item_location(component)): [group_id]
                                })

        if groups_usage_info:
            item_location = get_item_location(item)
            item_content_groups = groups_usage_info.get(unicode(item_location), None)

        item_id = unicode(cls._id_modifier(item.scope_ids.usage_id))
        indexed_items.add(item_id)
        if item.has_children:
            # determine if it's okay to skip adding the children herein based upon how recently any may have changed
            skip_child_index = skip_index or \
                (triggered_at is not None and (triggered_at - item.subtree_edited_on) > reindex_age)
            children_groups_usage = []
            for child_item in item.get_children():
                if modulestore.has_published_version(child_item):
                    children_groups_usage.append(
                        prepare_item_index(
                            child_item,
                            skip_index=skip_child_index,
                            groups_usage_info=groups_usage_info
                        )
                    )
            if None in children_groups_usage:
                item_content_groups = None

        if skip_index or not item_index_dictionary:
            return

        item_index = {}
        # if it has something to add to the index, then add it
        try:
            item_index.update(location_info)
            item_index.update(item_index_dictionary)
            item_index['id'] = item_id
            if item.start:
                item_index['start_date'] = item.start
            item_index['content_groups'] = item_content_groups if item_content_groups else None
            item_index.update(cls.supplemental_fields(item))
            items_index.append(item_index)
            indexed_count["count"] += 1
            return item_content_groups
        except Exception as err:  # pylint: disable=broad-except
            # broad exception so that index operation does not fail on one item of many
            log.warning('Could not index item: %s - %r', item.location, err)
            error_list.append(_('Could not index item: {}').format(item.location))

    try:
        with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only):
            structure = cls._fetch_top_level(modulestore, structure_key)
            groups_usage_info = cls.fetch_group_usage(modulestore, structure)

            # First perform any additional indexing from the structure object
            cls.supplemental_index_information(modulestore, structure)

            # Now index the content
            for item in structure.get_children():
                prepare_item_index(item, groups_usage_info=groups_usage_info)
            searcher.index(cls.DOCUMENT_TYPE, items_index)
            cls.remove_deleted_items(searcher, structure_key, indexed_items)
    except Exception as err:  # pylint: disable=broad-except
        # broad exception so that index operation does not prevent the rest of the application from working
        log.exception(
            "Indexing error encountered, courseware index may be out of date %s - %r",
            structure_key,
            err
        )
        error_list.append(_('General indexing error occurred'))

    if error_list:
        raise SearchIndexingError('Error(s) present during indexing', error_list)

    return indexed_count["count"]
def mock_perform(cls, filter_terms, text_search):
    # pylint: disable=no-member
    return SearchEngine.get_search_engine(cls.INDEX_NAME).search(
        field_dictionary=filter_terms,
        query_string=text_search,
        size=MAX_SIZE
    )
def searcher(self):
    """ Centralized call to getting the search engine for the test """
    return SearchEngine.get_search_engine(self.INDEX_NAME)
def setUp(self):
    super().setUp()
    ContentLibraryIndexer.remove_all_libraries()
    self.searcher = SearchEngine.get_search_engine(ContentLibraryIndexer.INDEX_NAME)
def setUp(self):
    super().setUp()
    ContentLibraryIndexer.remove_all_items()
    LibraryBlockIndexer.remove_all_items()
    self.searcher = SearchEngine.get_search_engine(LibraryBlockIndexer.INDEX_NAME)
def engine(cls):
    """ Return course team search engine (if feature is enabled). """
    if cls.search_is_enabled():
        return SearchEngine.get_search_engine(index=cls.INDEX_NAME)
def index(cls, modulestore, structure_key, triggered_at=None, reindex_age=REINDEX_AGE):
    """
    Process course for indexing

    Arguments:
        structure_key (CourseKey|LibraryKey) - course or library identifier
        triggered_at (datetime) - provides time at which indexing was triggered;
            useful for index updates - only things changed recently from that date
            (within REINDEX_AGE above ^^) will have their index updated, others skip
            updating their index but are still walked through in order to identify
            which items may need to be removed from the index
            If None, then a full reindex takes place

    Returns:
        Number of items that have been added to the index
    """
    error_list = []
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    if not searcher:
        return

    structure_key = cls.normalize_structure_key(structure_key)
    location_info = cls._get_location_info(structure_key)

    # Wrap counter in dictionary - otherwise we seem to lose scope inside the embedded function `index_item`
    indexed_count = {"count": 0}

    # indexed_items is a list of all the items that we wish to remain in the
    # index, whether or not we are planning to actually update their index.
    # This is used in order to build a query to remove those items not in this
    # list - those are ready to be destroyed
    indexed_items = set()

    def index_item(item, skip_index=False):
        """
        Add this item to the search index and indexed_items list

        Arguments:
            item - item to add to index, its children will be processed recursively
            skip_index - simply walk the children in the tree, the content change is
                older than the REINDEX_AGE window and would have been already indexed.
                This should really only be passed from the recursive child calls when
                this method has determined that it is safe to do so
        """
        is_indexable = hasattr(item, "index_dictionary")
        item_index_dictionary = item.index_dictionary() if is_indexable else None
        # if it's not indexable and it does not have children, then ignore
        if not item_index_dictionary and not item.has_children:
            return

        item_id = unicode(cls._id_modifier(item.scope_ids.usage_id))
        indexed_items.add(item_id)
        if item.has_children:
            # determine if it's okay to skip adding the children herein based upon how recently any may have changed
            skip_child_index = skip_index or \
                (triggered_at is not None and (triggered_at - item.subtree_edited_on) > reindex_age)
            for child_item in item.get_children():
                index_item(child_item, skip_index=skip_child_index)

        if skip_index or not item_index_dictionary:
            return

        item_index = {}
        # if it has something to add to the index, then add it
        try:
            item_index.update(location_info)
            item_index.update(item_index_dictionary)
            item_index['id'] = item_id
            if item.start:
                item_index['start_date'] = item.start

            searcher.index(cls.DOCUMENT_TYPE, item_index)
            indexed_count["count"] += 1
        except Exception as err:  # pylint: disable=broad-except
            # broad exception so that index operation does not fail on one item of many
            log.warning('Could not index item: %s - %r', item.location, err)
            error_list.append(_('Could not index item: {}').format(item.location))

    try:
        with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only):
            structure = cls._fetch_top_level(modulestore, structure_key)
            for item in structure.get_children():
                index_item(item)
            cls.remove_deleted_items(searcher, structure_key, indexed_items)
    except Exception as err:  # pylint: disable=broad-except
        # broad exception so that index operation does not prevent the rest of the application from working
        log.exception(
            "Indexing error encountered, courseware index may be out of date %s - %r",
            structure_key,
            err
        )
        error_list.append(_('General indexing error occurred'))

    if error_list:
        raise SearchIndexingError('Error(s) present during indexing', error_list)

    return indexed_count["count"]
def index_course(cls, modulestore, course_key, triggered_at=None, reindex_age=REINDEX_AGE):
    """
    Process course for indexing

    Arguments:
        course_key (CourseKey) - course identifier
        triggered_at (datetime) - provides time at which indexing was triggered;
            useful for index updates - only things changed recently from that date
            (within REINDEX_AGE above ^^) will have their index updated, others skip
            updating their index but are still walked through in order to identify
            which items may need to be removed from the index
            If None, then a full reindex takes place

    Returns:
        Number of items that have been added to the index
    """
    error_list = []
    searcher = SearchEngine.get_search_engine(INDEX_NAME)
    if not searcher:
        return

    location_info = {
        "course": unicode(course_key),
    }

    # Wrap counter in dictionary - otherwise we seem to lose scope inside the embedded function `index_item`
    indexed_count = {"count": 0}

    # indexed_items is a list of all the items that we wish to remain in the
    # index, whether or not we are planning to actually update their index.
    # This is used in order to build a query to remove those items not in this
    # list - those are ready to be destroyed
    indexed_items = set()

    def index_item(item, skip_index=False):
        """
        Add this item to the search index and indexed_items list

        Arguments:
            item - item to add to index, its children will be processed recursively
            skip_index - simply walk the children in the tree, the content change is
                older than the REINDEX_AGE window and would have been already indexed.
                This should really only be passed from the recursive child calls when
                this method has determined that it is safe to do so
        """
        is_indexable = hasattr(item, "index_dictionary")
        item_index_dictionary = item.index_dictionary() if is_indexable else None
        # if it's not indexable and it does not have children, then ignore
        if not item_index_dictionary and not item.has_children:
            return

        item_id = unicode(item.scope_ids.usage_id)
        indexed_items.add(item_id)
        if item.has_children:
            # determine if it's okay to skip adding the children herein based upon how recently any may have changed
            skip_child_index = skip_index or \
                (triggered_at is not None and (triggered_at - item.subtree_edited_on) > reindex_age)
            for child_item in item.get_children():
                index_item(child_item, skip_index=skip_child_index)

        if skip_index or not item_index_dictionary:
            return

        item_index = {}
        # if it has something to add to the index, then add it
        try:
            item_index.update(location_info)
            item_index.update(item_index_dictionary)
            item_index['id'] = item_id
            if item.start:
                item_index['start_date'] = item.start

            searcher.index(DOCUMENT_TYPE, item_index)
            indexed_count["count"] += 1
        except Exception as err:  # pylint: disable=broad-except
            # broad exception so that index operation does not fail on one item of many
            log.warning('Could not index item: %s - %r', item.location, err)
            error_list.append(_('Could not index item: {}').format(item.location))

    def remove_deleted_items():
        """
        remove any item that is present in the search index that is not present in
        updated list of indexed items as we find items we can shorten the set of
        items to keep
        """
        response = searcher.search(
            doc_type=DOCUMENT_TYPE,
            field_dictionary={"course": unicode(course_key)},
            exclude_ids=indexed_items
        )
        result_ids = [result["data"]["id"] for result in response["results"]]
        for result_id in result_ids:
            searcher.remove(DOCUMENT_TYPE, result_id)

    try:
        with modulestore.branch_setting(ModuleStoreEnum.RevisionOption.published_only):
            course = modulestore.get_course(course_key, depth=None)
            for item in course.get_children():
                index_item(item)
            remove_deleted_items()
    except Exception as err:  # pylint: disable=broad-except
        # broad exception so that index operation does not prevent the rest of the application from working
        log.exception(
            "Indexing error encountered, courseware index may be out of date %s - %r",
            course_key,
            err
        )
        error_list.append(_('General indexing error occurred'))

    if error_list:
        raise SearchIndexingError('Error(s) present during indexing', error_list)

    return indexed_count["count"]
def index_about_information(cls, modulestore, course):
    """
    Add the given course to the course discovery index

    Arguments:
        modulestore - modulestore object to use for operations
        course - course object from which to take properties, locate about information
    """
    searcher = SearchEngine.get_search_engine(cls.INDEX_NAME)
    if not searcher:
        return

    course_id = unicode(course.id)
    course_info = {
        'id': course_id,
        'course': course_id,
        'content': {},
        'image_url': course_image_url(course),
    }

    # load data for all of the 'about' modules for this course into a dictionary
    about_dictionary = {
        item.location.name: item.data
        for item in modulestore.get_items(course.id, qualifiers={"category": "about"})
    }

    about_context = {
        "course": course,
        "about_dictionary": about_dictionary,
    }

    for about_information in cls.ABOUT_INFORMATION_TO_INCLUDE:
        # Broad exception handler so that a single bad property does not scupper the collection of others
        try:
            section_content = about_information.get_value(**about_context)
        except:  # pylint: disable=bare-except
            section_content = None
            log.warning(
                "Course discovery could not collect property %s for course %s",
                about_information.property_name,
                course_id,
                exc_info=True,
            )

        if section_content:
            if about_information.index_flags & AboutInfo.ANALYSE:
                analyse_content = section_content
                if isinstance(section_content, basestring):
                    analyse_content = strip_html_content_to_text(section_content)
                course_info['content'][about_information.property_name] = analyse_content
                # "more_info" is additionally surfaced as a top-level property; this
                # special case relies on "more_info" carrying the ANALYSE flag, since
                # analyse_content is only defined inside this branch
                if about_information.property_name == "more_info":
                    course_info[about_information.property_name] = analyse_content
            if about_information.index_flags & AboutInfo.PROPERTY:
                course_info[about_information.property_name] = section_content

    # Broad exception handler to protect around and report problems with indexing
    try:
        searcher.index(cls.DISCOVERY_DOCUMENT_TYPE, [course_info])
    except:  # pylint: disable=bare-except
        log.exception(
            "Course discovery indexing error encountered, course discovery index may be out of date %s",
            course_id,
        )
        raise

    log.debug("Successfully added %s course to the course discovery index", course_id)