def test_vocabulary_rename(self):
    """Test that index updates properly after a vocabulary rename."""
    vocab = self.create_vocabulary(self.repo.slug)
    vocab_key = make_vocab_key(vocab['id'])
    term = self.create_term(self.repo.slug, vocab['slug'])

    def assert_term_facets(expected_label):
        """Check facet label, values, and faceted count for the term."""
        results = self.get_results()
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['label'],
            expected_label
        )
        self.assertEqual(
            results['facet_counts'][vocab_key]['values'],
            [{'count': 1, 'key': str(term['id']), 'label': term['label']}]
        )
        facet_filter = "{v}_exact:{t}".format(v=vocab_key, t=term['id'])
        self.assertEqual(
            self.get_results(selected_facets=[facet_filter])['count'],
            1
        )

    # Update vocabulary for resource type, assign term.
    self.patch_vocabulary(self.repo.slug, vocab['slug'], {
        "learning_resource_types": [
            self.resource.learning_resource_type.name
        ]
    })
    self.patch_learning_resource(self.repo.slug, self.resource.id, {
        "terms": [term['slug']]
    })
    assert_term_facets(vocab['name'])

    # Rename vocabulary. Facet counts should not change; only the
    # label should reflect the new name.
    new_name = "brand new name"
    self.patch_vocabulary(self.repo.slug, vocab['slug'], {
        "name": new_name
    })
    assert_term_facets(new_name)
def test_empty_term_name(self):
    """
    Test that 'empty' which is used by Elasticsearch for missing terms
    won't mix with resources that are actually missing terms.
    """
    # Create 'empty' term. Term is currently not assigned to anything.
    vocab_dict = dict(self.DEFAULT_VOCAB_DICT)
    vocab_dict['learning_resource_types'] = [
        self.resource.learning_resource_type.name
    ]
    vocab = self.create_vocabulary(self.repo.slug, vocab_dict)
    vocab_key = make_vocab_key(vocab['id'])
    term = self.create_term(self.repo.slug, vocab['slug'], {
        "label": "empty",
        "weight": 4
    })

    missing_facet = "_missing_:{v}_exact".format(v=vocab_key)
    term_facet = "{v}_exact:{t}".format(v=vocab_key, t=term['id'])

    def check(facet, expected_count, expected_missing):
        """Run a faceted search; verify count and missing_count."""
        results = self.get_results(selected_facets=[facet])
        self.assertEqual(results['count'], expected_count)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'],
            expected_missing
        )

    # There is one resource and it is missing a term, so a missing
    # search gets one result while a search for 'empty' gets zero.
    check(missing_facet, 1, 1)
    check(term_facet, 0, 0)

    # Assign term to learning resource; the situation reverses.
    self.patch_learning_resource(
        self.repo.slug, self.resource.id, {"terms": [term['slug']]})
    check(missing_facet, 0, 0)
    check(term_facet, 1, 0)
def test_empty_vocab_facet_count(self):
    """
    Test that we get vocabulary label and not something else for an
    empty vocabulary.
    """
    expected_name = 'name with spaces'
    vocab_dict = dict(self.DEFAULT_VOCAB_DICT)
    vocab_dict['name'] = expected_name
    created = self.create_vocabulary(self.repo.slug, vocab_dict)
    key = make_vocab_key(created['id'])
    # An empty vocabulary should still report its own name as the label.
    label = self.get_results()['facet_counts'][key]['facet']['label']
    self.assertEqual(label, expected_name)
def ensure_vocabulary_mappings(term_info):
    """
    Ensure the mapping is properly set in Elasticsearch to always
    do exact matches on taxonomy terms.

    Accepts the output of get_resource_terms. Calling this function during
    indexing means that vocabularies do not need to be added to the
    mapping in advance. This deals with the fact that vocabularies can be
    added on-the-fly without having to play around with extra signals.

    Args:
        term_info (dict): Details of terms for a group of LearningResources.
    """
    if not term_info:
        return
    get_conn()  # We don't need the return value; just for it to exist.

    # Retrieve current mapping from Elasticsearch.
    mapping = Mapping.from_es(index=INDEX_NAME, doc_type=DOC_TYPE)
    # Get the field names from the mapping.
    existing_vocabs = set(mapping.to_dict()["learningresource"]["properties"])

    # Collect every vocab key present in the data, then keep only the
    # ones not already mapped.
    new_keys = {
        make_vocab_key(vocab_id)
        for vocab_terms in term_info.values()
        for vocab_id in vocab_terms
    } - existing_vocabs

    # Add missing vocabularies as non-analyzed string fields so term
    # lookups are always exact matches.
    for vocab_key in new_keys:
        mapping.field(vocab_key, "string", index="not_analyzed")
    if new_keys:
        mapping.save(INDEX_NAME)
        refresh_index()
def test_empty_term_name(self):
    """
    Test that 'empty' which is used by Elasticsearch for missing terms
    won't mix with resources that are actually missing terms.
    """
    # Create 'empty' term. Term is currently not assigned to anything.
    vocab_dict = dict(self.DEFAULT_VOCAB_DICT)
    vocab_dict['learning_resource_types'] = [
        self.resource.learning_resource_type.name
    ]
    created_vocab = self.create_vocabulary(self.repo.slug, vocab_dict)
    vocab_key = make_vocab_key(created_vocab['id'])
    created_term = self.create_term(self.repo.slug, created_vocab['slug'], {
        "label": "empty",
        "weight": 4
    })

    missing_search = "_missing_:{v}_exact".format(v=vocab_key)
    term_search = "{v}_exact:{t}".format(
        v=vocab_key, t=created_term['id'])

    def expect(facet, count, missing_count):
        """Assert result count and missing_count for a faceted search."""
        results = self.get_results(selected_facets=[facet])
        self.assertEqual(results['count'], count)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'],
            missing_count
        )

    # There is one resource and it is missing a term: the missing
    # search finds it, but a search for the 'empty' term does not.
    expect(missing_search, 1, 1)
    expect(term_search, 0, 0)

    # Assign term to learning resource and re-check both searches.
    self.patch_learning_resource(self.repo.slug, self.resource.id, {
        "terms": [created_term['slug']]
    })
    expect(missing_search, 0, 0)
    expect(term_search, 1, 0)
def convert_aggregate(agg):
    """
    Convert elasticsearch-dsl output to the facet output
    currently being created from the Haystack data.

    Args:
        agg: Agg
    Returns:
        reformatted (dict): facet data
    """
    special_labels = {
        'run': 'Run',
        'course': 'Course',
        'resource_type': 'Item Type',
    }
    # Build id -> label lookups in one pass each instead of per-facet
    # database queries.
    term_lookup = {term.id: term.label for term in Term.objects.all()}
    vocab_lookup = {
        make_vocab_key(vocab.id): vocab.name
        for vocab in Vocabulary.objects.all()
    }

    def get_vocab_label(vocab_key):
        """Get label for vocab, falling back to the raw key."""
        return vocab_lookup.get(vocab_key, vocab_key)

    def get_term_label(term_id):
        """Get label for term, falling back to the id as a string."""
        return term_lookup.get(int(term_id), str(term_id))

    def get_builtin_label(key):
        """Get label for special (non-vocabulary) facet types."""
        return special_labels.get(key, key)

    # Group aggregation output into fields, keyed by bucket-name suffix.
    vocab_buckets = defaultdict(dict)
    builtin_buckets = defaultdict(dict)
    for key, value in agg['_d_'].items():
        if key.endswith("_missing"):
            vocab_buckets[key[:-len("_missing")]]['missing'] = value
        elif key.endswith("_buckets"):
            vocab_buckets[key[:-len("_buckets")]]['buckets'] = (
                value['buckets'])
        elif key.endswith("_builtins"):
            builtin_buckets[key[:-len("_builtins")]]['buckets'] = (
                value['buckets'])
        # No missing counts for run, course, resource_types.

    reformatted = {}
    for key, buckets_and_missing in vocab_buckets.items():
        values = [
            {
                "key": facet['key'],
                "label": get_term_label(facet['key']),
                "count": facet["doc_count"],
            }
            for facet in buckets_and_missing['buckets']
        ]
        facet = {
            "key": key,
            "label": get_vocab_label(key),
            'missing_count': buckets_and_missing['missing']['doc_count'],
        }
        reformatted[key] = {
            "facet": facet,
            "values": values,
        }
    for key, buckets_and_missing in builtin_buckets.items():
        values = [
            {
                "key": facet['key'],
                "label": facet['key'],
                "count": facet['doc_count'],
            }
            for facet in buckets_and_missing['buckets']
        ]
        facet = {
            "key": key,
            "label": get_builtin_label(key),
            # Built-in facets never track a missing count.
            "missing_count": 0,
        }
        reformatted[key] = {
            "facet": facet,
            "values": values,
        }
    return reformatted
def resource_to_dict(resource, term_info):
    """
    Retrieve important values from a LearningResource to index.

    This dict corresponds to the mapping created in Elasticsearch. The
    titlesort bits, with the "0" and "1" prefixes, were copied from the
    prepare_titlesort function in search/search_indexes.py. It was there
    to make blank titles sort to the bottom instead of the top.

    Args:
        resource (LearningResource): Item to convert to dict.
        term_info (dict): Vocabulary terms assigned to resource.
    Returns:
        rec (dict): Dictionary representation of the LearningResource.
    """
    record = {
        "title": resource.title,
        # The zero is for sorting blank items to the bottom. See below.
        "titlesort": "0{0}".format(resource.title.strip()),
        "id": resource.id,
        "_id": resource.id,  # The ID used by Elasticsearch.
        "resource_type": resource.learning_resource_type.name,
        "description": resource.description,
        "description_path": resource.description_path,
        "content_xml": resource.content_xml,
        "content_stripped": strip_xml(resource.content_xml),
        "xa_nr_views": resource.xa_nr_views,
        "xa_nr_attempts": resource.xa_nr_attempts,
        "xa_avg_grade": resource.xa_avg_grade,
        "xa_histogram_grade": resource.xa_histogram_grade,
    }

    course_info = get_course_metadata(resource.course_id)
    record["preview_url"] = get_preview_url(
        resource,
        org=course_info["org"],
        course_number=course_info["course_number"],
        run=course_info["run"],
    )
    record["run"] = course_info["run"]
    record["course"] = course_info["course_number"]
    record["repository"] = course_info["repo_slug"]

    # Index term info. Since these fields all use the "not_analyzed"
    # index, they must all be exact matches.
    for vocab_id, term_ids in term_info.items():
        record[make_vocab_key(vocab_id)] = term_ids

    # If the title is empty, sort it to the bottom. See above.
    if record["titlesort"] == "0":
        record["titlesort"] = "1"

    # Keys that may have unicode issues.
    for key in ('title', 'titlesort', 'resource_type', 'description',
                'content_xml', 'content_stripped', 'description_path'):
        try:
            # Thanks to unicode_literals above, this works in
            # Python 2 and Python 3. Avoid trying to decode a string
            # if it's already unicode.
            if not isinstance(record[key], type("")):
                record[key] = record[key].decode('utf-8')
        except AttributeError:
            pass  # Python 3
    return record
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms: (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    if terms is None:
        terms = {}
    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Limit returned fields since content_xml can be huge and is unnecessary.
    search = search.fields(_get_field_names())

    if tokens is not None:
        # Search on title, description, and content_xml (minus markup).
        search = search.query(query.MultiMatch(
            query=tokens,
            fields=["title", "description", "content_stripped"],
        ))

    # Filter further on taxonomy terms.
    for field, value in terms.items():
        if value is None:
            missing = "_missing_:({key})".format(key=field)
            search = search.query("query_string", query=missing)
        else:
            search = search.query("match", **{field: value})

    if repo_slug is not None:
        # Filter further on repository.
        search = search.query("match", repository=repo_slug)

    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            descending = sort_by.startswith("-")
            field_name = sort_by[1:] if descending else sort_by
            if "xa" not in field_name:
                field_name = "xa_{0}".format(field_name)
            sort_by = "-{0}".format(field_name) if descending else field_name
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    # Aggregate per-vocabulary buckets plus a missing count for each.
    for vocab_id in set(get_vocab_ids(repo_slug=repo_slug)):
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket(
            "{key}_missing".format(key=vocab_key),
            "missing", field=vocab_key)
        search.aggs.bucket(
            "{key}_buckets".format(key=vocab_key),
            "terms", field=vocab_key)
    # Built-in facets have no missing-count aggregation.
    for builtin in ('run', 'course', 'resource_type'):
        search.aggs.bucket(
            '{key}_builtins'.format(key=builtin),
            "terms", field=builtin)

    return SearchResults(search)
def convert_aggregate(agg):
    """
    Convert elasticsearch-dsl output to the facet output
    currently being created from the Haystack data.

    Args:
        agg: Agg
    Returns:
        reformatted (dict): facet data
    """
    special_labels = {
        'run': 'Run',
        'course': 'Course',
        'resource_type': 'Item Type',
    }
    # Build id -> label lookups in one pass each instead of per-facet
    # database queries.
    term_lookup = {term.id: term.label for term in Term.objects.all()}
    vocab_lookup = {
        make_vocab_key(vocab.id): vocab.name
        for vocab in Vocabulary.objects.all()
    }

    def get_vocab_label(vocab_key):
        """Get label for vocab, falling back to the raw key."""
        return vocab_lookup.get(vocab_key, vocab_key)

    def get_term_label(term_id):
        """Get label for term, falling back to the id as a string."""
        return term_lookup.get(int(term_id), str(term_id))

    def get_builtin_label(key):
        """Get label for special (non-vocabulary) facet types."""
        return special_labels.get(key, key)

    # Group aggregation output into fields, keyed by bucket-name suffix.
    vocab_buckets = defaultdict(dict)
    builtin_buckets = defaultdict(dict)
    for key, value in agg['_d_'].items():
        if key.endswith("_missing"):
            vocab_buckets[key[:-len("_missing")]]['missing'] = value
        elif key.endswith("_buckets"):
            vocab_buckets[key[:-len("_buckets")]]['buckets'] = (
                value['buckets'])
        elif key.endswith("_builtins"):
            builtin_buckets[key[:-len("_builtins")]]['buckets'] = (
                value['buckets'])
        # No missing counts for run, course, resource_types.

    reformatted = {}
    for key, buckets_and_missing in vocab_buckets.items():
        values = [
            {
                "key": facet['key'],
                "label": get_term_label(facet['key']),
                "count": facet["doc_count"],
            }
            for facet in buckets_and_missing['buckets']
        ]
        facet = {
            "key": key,
            "label": get_vocab_label(key),
            'missing_count': buckets_and_missing['missing']['doc_count'],
        }
        reformatted[key] = {
            "facet": facet,
            "values": values,
        }
    for key, buckets_and_missing in builtin_buckets.items():
        values = [
            {
                "key": facet['key'],
                "label": facet['key'],
                "count": facet['doc_count'],
            }
            for facet in buckets_and_missing['buckets']
        ]
        facet = {
            "key": key,
            "label": get_builtin_label(key),
            # Built-in facets never track a missing count.
            "missing_count": 0,
        }
        reformatted[key] = {
            "facet": facet,
            "values": values,
        }
    return reformatted
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms: (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    if terms is None:
        terms = {}
    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Limit returned fields since content_xml can be huge and is unnecessary.
    search = search.fields(_get_field_names())

    if tokens is not None:
        # Search on title, description, and content_xml (minus markup).
        multi_match = query.MultiMatch(
            query=tokens,
            fields=["title", "description", "content_stripped"],
        )
        search = search.query(multi_match)

    # Filter further on taxonomy terms.
    for field, value in terms.items():
        if value is None:
            search = search.query(
                "query_string",
                query="_missing_:({key})".format(key=field),
            )
        else:
            search = search.query("match", **{field: value})

    if repo_slug is not None:
        # Filter further on repository.
        search = search.query("match", repository=repo_slug)

    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            descending = sort_by.startswith("-")
            field_name = sort_by[1:] if descending else sort_by
            if "xa" not in field_name:
                field_name = "xa_{0}".format(field_name)
            sort_by = "-{0}".format(field_name) if descending else field_name
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    # Aggregate per-vocabulary buckets plus a missing count for each.
    for vocab_id in set(get_vocab_ids(repo_slug=repo_slug)):
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket(
            "{key}_missing".format(key=vocab_key),
            "missing",
            field=vocab_key,
        )
        search.aggs.bucket(
            "{key}_buckets".format(key=vocab_key),
            "terms",
            field=vocab_key,
        )
    # Built-in facets have no missing-count aggregation.
    for builtin in ('run', 'course', 'resource_type'):
        search.aggs.bucket(
            '{key}_builtins'.format(key=builtin),
            "terms",
            field=builtin,
        )

    return SearchResults(search)
def count_faceted_results(self, vocab_id, term_id):
    """Return count of matching indexed records by facet."""
    facet_terms = {make_vocab_key(vocab_id): term_id}
    results = search_index(repo_slug=self.repo.slug, terms=facet_terms)
    return results.count()