Code example #1
File: test_vocabulary.py Project: olabi/lore
    def test_vocabulary_rename(self):
        """Test that index updates properly after a vocabulary rename."""
        vocab_result = self.create_vocabulary(self.repo.slug)
        vocab_id = vocab_result['id']
        vocab_key = make_vocab_key(vocab_id)
        vocab_slug = vocab_result['slug']
        term_result = self.create_term(self.repo.slug, vocab_slug)
        term_id = term_result['id']
        term_slug = term_result['slug']
        term_label = term_result['label']

        # Update vocabulary for resource type, assign term
        self.patch_vocabulary(self.repo.slug, vocab_slug, {
            "learning_resource_types": [
                self.resource.learning_resource_type.name
            ]
        })
        self.patch_learning_resource(self.repo.slug, self.resource.id, {
            "terms": [term_slug]
        })

        self.assertEqual(
            self.get_results()['facet_counts'][vocab_key]['facet']['label'],
            vocab_result['name']
        )
        self.assertEqual(
            self.get_results()['facet_counts'][vocab_key]['values'],
            [{'count': 1, 'key': str(term_id), 'label': term_label}]
        )
        self.assertEqual(
            self.get_results(selected_facets=["{v}_exact:{t}".format(
                v=vocab_key,
                t=term_id
            )])['count'],
            1
        )

        # Rename vocabulary
        name = "brand new name"
        self.patch_vocabulary(self.repo.slug, vocab_slug, {
            "name": name
        })

        # The label should update to the new name; facet counts should not change.
        self.assertEqual(
            self.get_results()['facet_counts'][vocab_key]['facet']['label'],
            name
        )
        self.assertEqual(
            self.get_results()['facet_counts'][vocab_key]['values'],
            [{'count': 1, 'key': str(term_id), 'label': term_label}]
        )
        self.assertEqual(
            self.get_results(selected_facets=["{v}_exact:{t}".format(
                v=vocab_key,
                t=term_id
            )])['count'],
            1
        )
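
Every example on this page keys facet data by the output of make_vocab_key, which builds the Elasticsearch field name for a vocabulary. Its implementation is not shown here; a minimal sketch, assuming the key only needs to be a stable field name unique per vocabulary ID (the "vocab_" prefix is an assumption, not lore's confirmed format):

def make_vocab_key(vocab_id):
    """Hypothetical sketch: build the Elasticsearch field name for a vocabulary."""
    # Callers append "_exact" themselves when building selected_facets
    # strings such as "{vocab_key}_exact:{term_id}".
    return "vocab_{0}".format(vocab_id)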
Code example #2
File: test_search.py Project: fatimahseraj/lore
    def test_empty_term_name(self):
        """
        Test that 'empty' which is used by Elasticsearch for missing terms
        won't mix with resources that are actually missing terms.
        """
        # Create 'empty' term. Term is currently not assigned to anything.
        vocab_dict = dict(self.DEFAULT_VOCAB_DICT)
        vocab_dict['learning_resource_types'] = [
            self.resource.learning_resource_type.name
        ]
        vocab_result = self.create_vocabulary(self.repo.slug, vocab_dict)
        vocab_slug = vocab_result['slug']
        vocab_id = vocab_result['id']
        vocab_key = make_vocab_key(vocab_id)

        term_result = self.create_term(self.repo.slug, vocab_slug, {
            "label": "empty",
            "weight": 4
        })
        term_id = term_result['id']
        term_slug = term_result['slug']

        # There is one resource and it is missing a term.
        # Test that a missing search will get one result.
        results = self.get_results(
            selected_facets=["_missing_:{v}_exact".format(v=vocab_key)])
        self.assertEqual(results['count'], 1)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'], 1)

        # A search for 'empty' should get zero results.
        results = self.get_results(
            selected_facets=["{v}_exact:{t}".format(v=vocab_key, t=term_id)])
        self.assertEqual(results['count'], 0)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'], 0)

        # Assign term to learning resource
        self.patch_learning_resource(self.repo.slug, self.resource.id,
                                     {"terms": [term_slug]})

        # Do missing search again.
        results = self.get_results(
            selected_facets=["_missing_:{v}_exact".format(v=vocab_key)])
        self.assertEqual(results['count'], 0)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'], 0)

        # Do search for term.
        results = self.get_results(
            selected_facets=["{v}_exact:{t}".format(v=vocab_key, t=term_id)])
        self.assertEqual(results['count'], 1)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'], 0)
Code example #3
File: test_vocabulary.py Project: olabi/lore
    def test_empty_vocab_facet_count(self):
        """
        Test that we get vocabulary label and not something else for
        an empty vocabulary.
        """
        vocab_dict = dict(self.DEFAULT_VOCAB_DICT)
        name = 'name with spaces'
        vocab_dict['name'] = name
        vocab_result = self.create_vocabulary(self.repo.slug, vocab_dict)
        vocab_id = vocab_result['id']
        vocab_key = make_vocab_key(vocab_id)

        self.assertEqual(
            self.get_results()['facet_counts'][vocab_key]['facet']['label'],
            name
        )
Code example #4
File: utils.py Project: olabi/lore
def ensure_vocabulary_mappings(term_info):
    """
    Ensure the mapping is properly set in Elasticsearch to always do exact
    matches on taxonomy terms. Accepts the output of get_resource_terms.

    Calling this function during indexing means that vocabularies do not
    need to be added to the mapping in advance. This deals with the fact
    that vocabularies can be added on-the-fly without having to play around
    with extra signals.

    Args:
        term_info (dict): Details of terms for a group of LearningResources.
    """
    if len(term_info) == 0:
        return

    get_conn()  # We don't need the return value; just for it to exist.

    # Retrieve current mapping from Elasticsearch.
    mapping = Mapping.from_es(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Get the field names from the mapping.
    existing_vocabs = set(mapping.to_dict()["learningresource"]["properties"])

    # Get all the taxonomy names from the data.
    vocab_ids = set()
    for vocab_terms in term_info.values():
        for vocab_id in vocab_terms.keys():
            vocab_ids.add(vocab_id)
    updated = False
    # Add vocabulary to mapping if necessary.
    for vocab_id in vocab_ids:
        vocab_key = make_vocab_key(vocab_id)
        if vocab_key in existing_vocabs:
            continue
        mapping.field(vocab_key, "string", index="not_analyzed")
        updated = True
    if updated:
        mapping.save(INDEX_NAME)
        refresh_index()
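
The docstring above says term_info is the output of get_resource_terms, which is not reproduced on this page. Judging from the nested loops, its shape is {resource_id: {vocab_id: [term_id, ...]}}; a hypothetical call under that assumption:

# Hypothetical term_info, shaped as the nested loops above expect.
term_info = {
    42: {7: [101, 102]},  # resource 42 has terms 101 and 102 in vocabulary 7
    43: {7: [101]},
}
# Adds a not_analyzed "string" field for vocabulary 7 to the mapping,
# then saves and refreshes the index, but only if the field was missing.
ensure_vocabulary_mappings(term_info)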
Code example #5
File: test_search.py Project: mitodl/lore
    def test_empty_term_name(self):
        """
        Test that 'empty' which is used by Elasticsearch for missing terms
        won't mix with resources that are actually missing terms.
        """
        # Create 'empty' term. Term is currently not assigned to anything.
        vocab_dict = dict(self.DEFAULT_VOCAB_DICT)
        vocab_dict['learning_resource_types'] = [
            self.resource.learning_resource_type.name
        ]
        vocab_result = self.create_vocabulary(self.repo.slug, vocab_dict)
        vocab_slug = vocab_result['slug']
        vocab_id = vocab_result['id']
        vocab_key = make_vocab_key(vocab_id)

        term_result = self.create_term(self.repo.slug, vocab_slug, {
            "label": "empty",
            "weight": 4
        })
        term_id = term_result['id']
        term_slug = term_result['slug']

        # There is one resource and it is missing a term.
        # Test that a missing search will get one result.
        results = self.get_results(
            selected_facets=["_missing_:{v}_exact".format(v=vocab_key)]
        )
        self.assertEqual(results['count'], 1)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'], 1)

        # A search for 'empty' should get zero results.
        results = self.get_results(
            selected_facets=["{v}_exact:{t}".format(
                v=vocab_key,
                t=term_id
            )]
        )
        self.assertEqual(results['count'], 0)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'], 0)

        # Assign term to learning resource
        self.patch_learning_resource(self.repo.slug, self.resource.id, {
            "terms": [term_slug]
        })

        # Do missing search again.
        results = self.get_results(
            selected_facets=["_missing_:{v}_exact".format(v=vocab_key)]
        )
        self.assertEqual(results['count'], 0)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'],
            0
        )

        # Do search for term.
        results = self.get_results(
            selected_facets=["{v}_exact:{t}".format(
                v=vocab_key,
                t=term_id
            )]
        )
        self.assertEqual(results['count'], 1)
        self.assertEqual(
            results['facet_counts'][vocab_key]['facet']['missing_count'],
            0
        )
Code example #6
File: utils.py Project: olabi/lore
def convert_aggregate(agg):
    """
    Convert elasticsearch-dsl output to the facet output
    currently being created from the Haystack data.
    Args:
        agg: Agg
    Returns:
        reformatted (dict): facet data
    """

    special_labels = {
        'run': 'Run',
        'course': 'Course',
        'resource_type': 'Item Type',
    }

    vocab_lookup = {}
    term_lookup = {}
    for term in Term.objects.all():
        term_lookup[term.id] = term.label

    for vocab in Vocabulary.objects.all():
        vocab_lookup[make_vocab_key(vocab.id)] = vocab.name

    def get_vocab_label(vocab_key):
        """Get label for vocab."""
        return vocab_lookup.get(vocab_key, vocab_key)

    def get_term_label(term_id):
        """Get label for term."""
        return term_lookup.get(int(term_id), str(term_id))

    def get_builtin_label(key):
        """Get label for special types."""
        return special_labels.get(key, key)

    # Group into fields.
    vocab_buckets = defaultdict(dict)
    builtin_buckets = defaultdict(dict)
    for key, value in agg['_d_'].items():
        if key.endswith("_missing"):
            key = key[:-len("_missing")]
            vocab_buckets[key]['missing'] = value
        elif key.endswith("_buckets"):
            key = key[:-len("_buckets")]
            vocab_buckets[key]['buckets'] = value['buckets']
        elif key.endswith("_builtins"):
            key = key[:-len("_builtins")]
            builtin_buckets[key]['buckets'] = value['buckets']
            # No missing counts for run, course, or resource_type.

    reformatted = {}
    for key, buckets_and_missing in vocab_buckets.items():
        buckets = buckets_and_missing['buckets']
        missing = buckets_and_missing['missing']

        values = [
            {
                "key": facet['key'],
                "label": get_term_label(facet['key']),
                "count": facet["doc_count"]
            } for facet in buckets
        ]
        facet = {
            "key": key,
            "label": get_vocab_label(key),
            'missing_count': missing['doc_count']
        }

        reformatted[key] = {
            "facet": facet,
            "values": values,
        }

    for key, buckets_and_missing in builtin_buckets.items():
        buckets = buckets_and_missing['buckets']

        values = [
            {
                "key": facet['key'],
                "label": facet['key'],
                "count": facet['doc_count']
            } for facet in buckets
        ]
        facet = {
            "key": key,
            "label": get_builtin_label(key),
            "missing_count": 0
        }

        reformatted[key] = {
            "facet": facet,
            "values": values,
        }

    return reformatted
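
To make the reshaping concrete, here is a hypothetical aggregation payload following the _missing/_buckets/_builtins suffix conventions above; the vocab_7 field name assumes the make_vocab_key sketch from earlier, and the labels come from the Term and Vocabulary lookups:

agg = {'_d_': {
    'vocab_7_missing': {'doc_count': 2},
    'vocab_7_buckets': {'buckets': [{'key': '101', 'doc_count': 3}]},
    'course_builtins': {'buckets': [{'key': '8.01x', 'doc_count': 5}]},
}}
facets = convert_aggregate(agg)
# facets['vocab_7']['facet']  -> {'key': 'vocab_7',
#                                 'label': <vocabulary 7's name, or 'vocab_7'>,
#                                 'missing_count': 2}
# facets['vocab_7']['values'] -> [{'key': '101',
#                                  'label': <term 101's label, or '101'>,
#                                  'count': 3}]
# facets['course']['facet']['label'] -> 'Course' (from special_labels)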
Code example #7
File: utils.py Project: olabi/lore
def resource_to_dict(resource, term_info):
    """
    Retrieve important values from a LearningResource to index.

    This dict corresponds to the mapping created in Elasticsearch.

    The titlesort bits, with the "0" and "1" prefixes, were copied from
    the prepare_titlesort function in search/search_indexes.py. It was there
    to make blank titles sort to the bottom instead of the top.

    Args:
        resource (LearningResource): Item to convert to dict.
        term_info (dict): Vocabulary terms assigned to resource.
    Returns:
        rec (dict): Dictionary representation of the LearningResource.
    """

    rec = {
        "title": resource.title,
        # The zero is for sorting blank items to the bottom. See below.
        "titlesort": "0{0}".format(resource.title.strip()),
        "id": resource.id,
        "_id": resource.id,  # The ID used by Elasticsearch.
        "resource_type": resource.learning_resource_type.name,
        "description": resource.description,
        "description_path": resource.description_path,
        "content_xml": resource.content_xml,
        "content_stripped": strip_xml(resource.content_xml),
        "xa_nr_views": resource.xa_nr_views,
        "xa_nr_attempts": resource.xa_nr_attempts,
        "xa_avg_grade": resource.xa_avg_grade,
        "xa_histogram_grade": resource.xa_histogram_grade,
    }

    course = get_course_metadata(resource.course_id)
    rec["preview_url"] = get_preview_url(
        resource,
        org=course["org"],
        course_number=course["course_number"],
        run=course["run"],
    )
    rec["run"] = course["run"]
    rec["course"] = course["course_number"]
    rec["repository"] = course["repo_slug"]

    # Index term info. Since these fields all use the "not_analyzed"
    # index, they must all be exact matches.
    for vocab_id, term_ids in term_info.items():
        rec[make_vocab_key(vocab_id)] = term_ids

    # If the title is empty, sort it to the bottom. See above.
    if rec["titlesort"] == "0":
        rec["titlesort"] = "1"

    # Keys that may have unicode issues.
    text_keys = (
        'title', 'titlesort', 'resource_type', 'description', 'content_xml',
        'content_stripped', 'description_path',
    )
    for key in text_keys:
        try:
            # Thanks to unicode_literals above, this works in
            # Python 2 and Python 3. Avoid trying to decode a string
            # if it's already unicode.
            if not isinstance(rec[key], type("")):
                rec[key] = rec[key].decode('utf-8')
        except AttributeError:
            pass  # Python 3

    return rec
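
resource_to_dict only builds the document; the code that actually indexes it is not shown on this page. A sketch of one plausible flow, assuming elasticsearch-py's bulk helper plus the get_conn, INDEX_NAME, DOC_TYPE, and refresh_index names used elsewhere here (index_resources itself is hypothetical):

from elasticsearch.helpers import bulk

def index_resources(resources, term_info):
    """Hypothetical bulk-indexing wrapper around resource_to_dict."""
    conn = get_conn()
    # Make sure every vocabulary field exists in the mapping first;
    # see code example #4.
    ensure_vocabulary_mappings(term_info)
    actions = []
    for resource in resources:
        rec = resource_to_dict(resource, term_info.get(resource.id, {}))
        rec["_index"] = INDEX_NAME  # bulk() pops the underscore metadata keys
        rec["_type"] = DOC_TYPE     # and indexes the remaining keys as the doc.
        actions.append(rec)
    bulk(conn, actions)
    refresh_index()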
Code example #8
File: utils.py Project: olabi/lore
def search_index(tokens=None, repo_slug=None, sort_by=None, terms=None):
    """
    Perform a search in Elasticsearch.

    Args:
        tokens (unicode): string of one or more words
        repo_slug (unicode): repository slug
        sort_by (string): field to sort by
        terms (dict): {"vocabulary name": ["term1" [, "term2"]]}
    Returns:
        results (SearchResults)
    """
    if terms is None:
        terms = {}

    search = Search(index=INDEX_NAME, doc_type=DOC_TYPE)

    # Limit returned fields since content_xml can be huge and is unnecessary.
    search = search.fields(_get_field_names())

    if tokens is not None:
        # Search on title, description, and content_xml (minus markup).
        multi = query.MultiMatch(
            query=tokens, fields=["title", "description", "content_stripped"])
        search = search.query(multi)

    # Filter further on taxonomy terms.
    for key, value in terms.items():
        if value is None:
            search = search.query(
                "query_string",
                query="_missing_:({key})".format(key=key)
            )
        else:
            search = search.query("match", **{key: value})

    if repo_slug is not None:
        # Filter further on repository.
        search = search.query("match", repository=repo_slug)
    if sort_by is None:
        # Always sort by ID to preserve ordering.
        search = search.sort("id")
    else:
        # Temporary workaround; the values in sorting.py should be updated,
        # but for now Haystack is still using them. Also, the hyphen is
        # required because we sort the numeric values high to low.
        if "title" not in sort_by:
            reverse = sort_by.startswith("-")
            if reverse:
                sort_by = sort_by[1:]
            if "xa" not in sort_by:
                sort_by = "xa_{0}".format(sort_by)
            if reverse:
                sort_by = "-{0}".format(sort_by)
        # Always sort by ID to preserve ordering.
        search = search.sort(sort_by, "id")

    vocab_ids = set(get_vocab_ids(repo_slug=repo_slug))
    for vocab_id in vocab_ids:
        vocab_key = make_vocab_key(vocab_id)
        search.aggs.bucket(
            "{key}_missing".format(key=vocab_key),
            "missing", field=vocab_key
        )
        search.aggs.bucket(
            "{key}_buckets".format(key=vocab_key),
            "terms", field=vocab_key
        )
    for key in ('run', 'course', 'resource_type'):
        search.aggs.bucket(
            '{key}_builtins'.format(key=key), "terms", field=key
        )

    return SearchResults(search)
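
A hypothetical call, assuming a repository slug and a vocabulary/term pair that actually exist in the index; note how the sort_by workaround rewrites "-nr_views" into "-xa_nr_views":

results = search_index(
    tokens="thermodynamics",           # free-text match on title/description/content
    repo_slug="physics-repo",          # hypothetical repository slug
    sort_by="-nr_views",               # becomes "-xa_nr_views" via the workaround
    terms={make_vocab_key(7): "101"},  # exact match on term 101 in vocabulary 7
)
print(results.count())  # SearchResults supports count(); see code example #9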
Code example #9
File: base_es.py Project: dreganism/lore
    def count_faceted_results(self, vocab_id, term_id):
        """Return count of matching indexed records by facet."""
        return search_index(
            repo_slug=self.repo.slug,
            terms={make_vocab_key(vocab_id): term_id}
        ).count()
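
In a test, this helper might back the same kind of assertion seen in code example #1 (the expected 1 assumes exactly one indexed resource has the term assigned):

self.assertEqual(self.count_faceted_results(vocab_id, term_id), 1)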