Esempio n. 1
0
    def test_doc_search_pagination(self, api_client, project):
        """Doc search results are paginated and honour ``page_size``."""
        version = project.versions.all()[0]
        page = HTMLFile.objects.filter(version=version)[0]
        first_title_word = page.processed_json['title'].split()[0]

        # Clone the page 60 times. Resetting the primary key before saving
        # makes Django insert a new row instead of updating the existing one.
        clones = 60
        for _ in range(clones):
            page.pk = None
            page.save()
            PageDocument().update(page)

        params = {
            'q': first_title_word,
            'project': project.slug,
            'version': version.slug,
        }

        # Default request: 61 matches overall (the original page plus the
        # clones), a link to the next page, and 50 results on this page.
        first_page = self.get_search(api_client, params)
        assert first_page.status_code == 200
        assert first_page.data['count'] == clones + 1
        assert first_page.data['next'] is not None
        assert len(first_page.data['results']) == 50

        # An explicit ``page_size`` limits the number of returned results.
        params['page_size'] = 5
        small_page = self.get_search(api_client, params)
        assert small_page.status_code == 200
        assert len(small_page.data['results']) == 5
Esempio n. 2
0
    def test_facted_page_xss(self, client, project):
        """The section highlight for an ``XSS`` query contains escaped markup."""
        expected = """
        &lt;h3&gt;<span>XSS</span> exploit&lt;&#x2F;h3&gt;
        """.strip()

        search = PageDocument.faceted_search(query='XSS', user='')
        response = search.execute()

        # Exactly one page is expected to match.
        page_hits = response.hits.hits
        assert len(page_hits) == 1

        nested = page_hits[0]['inner_hits']

        # Nothing should come back from the ``domains`` inner hits ...
        domain_matches = nested['domains']['hits']['hits']
        assert len(domain_matches) == 0

        # ... and exactly one match from the ``sections`` inner hits.
        section_matches = nested['sections']['hits']['hits']
        assert len(section_matches) == 1

        highlights = section_matches[0]['highlight']['sections.content']
        assert len(highlights) == 1
        assert expected in highlights[0]
Esempio n. 3
0
def elastic_search(request):
    """
    Use Elasticsearch for global search.

    Reads ``q``, ``type`` (default ``'project'``), ``project``, ``version``
    (default ``LATEST``), ``taxonomy`` and ``language`` from the query string.
    When ``q`` is non-empty, runs either a project search or a file (page)
    search depending on ``type``, and logs the query.

    :param request: the incoming HTTP request
    :returns: the rendered ``search/elastic_search.html`` template, whose
        context holds the user input plus ``results`` and ``facets``
    """
    user_input = UserInput(
        query=request.GET.get('q'),
        type=request.GET.get('type', 'project'),
        project=request.GET.get('project'),
        version=request.GET.get('version', LATEST),
        taxonomy=request.GET.get('taxonomy'),
        language=request.GET.get('language'),
    )
    # Defaults handed to the template when nothing was searched.
    results = ''
    facets = {}

    if user_input.query:
        if user_input.type == 'project':
            project_search = ProjectDocument.faceted_search(
                query=user_input.query, language=user_input.language)
            results = project_search.execute()
            facets = results.facets
        elif user_input.type == 'file':
            kwargs = {}
            if user_input.project:
                # Restrict the search to the projects resolved for this user;
                # the helper's name indicates it raises a 404 otherwise.
                projects_list = get_project_list_or_404(
                    project_slug=user_input.project, user=request.user)
                kwargs['projects_list'] = [
                    project.slug for project in projects_list
                ]
            if user_input.version:
                kwargs['versions_list'] = user_input.version

            page_search = PageDocument.faceted_search(query=user_input.query,
                                                      **kwargs)
            results = page_search.execute()
            facets = results.facets

    if settings.DEBUG:
        # ``pprint`` writes to stdout itself and returns ``None``; wrapping it
        # in ``print()`` only emitted an extra "None" line.
        pprint(results)
        pprint(facets)

    if user_input.query:
        # ``is_authenticated`` is an attribute on Django >= 1.10; calling it
        # as a method fails on Django >= 2.0.
        user = request.user if request.user.is_authenticated else ''
        log.info(
            LOG_TEMPLATE.format(
                user=user,
                project=user_input.project or '',
                type=user_input.type or '',
                version=user_input.version or '',
                language=user_input.language or '',
                msg=user_input.query or '',
            ))

    template_vars = user_input._asdict()
    template_vars.update({'results': results, 'facets': facets})
    return render(
        request,
        'search/elastic_search.html',
        template_vars,
    )
Esempio n. 4
0
    def test_doc_search_filter_by_version(self, api_client, project):
        """Doc search results are restricted to the requested version."""
        query = get_search_query_from_project_file(project_slug=project.slug)
        latest = project.versions.all()[0]

        # A second, active and public version of the same project.
        other_version = G(
            Version,
            project=project,
            active=True,
            privacy_level=PUBLIC,
        )

        # Copy every HTMLFile of the latest version onto the new version.
        # Clearing the primary key makes Django insert a new row on save.
        for html_file in HTMLFile.objects.all().filter(version=latest):
            html_file.version = other_version
            html_file.pk = None
            html_file.save()
            PageDocument().update(html_file)

        resp = self.get_search(
            api_client,
            {
                'q': query,
                'project': project.slug,
                'version': other_version.slug,
            },
        )
        assert resp.status_code == 200

        hits = resp.data['results']
        assert len(hits) == 1
        assert hits[0]['project'] == project.slug
Esempio n. 5
0
 def test_facted_page_xss(self, client, project):
     """The first content highlight for ``XSS`` starts with escaped markup."""
     expected = """
     &lt;h3&gt;<em>XSS</em> exploit&lt;&#x2F;h3&gt;
     """.strip()

     search = PageDocument.faceted_search(query='XSS', user='')
     response = search.execute()

     # Compare only the leading part of the first highlighted fragment.
     first_fragment = response[0].meta.highlight.content[0]
     assert first_fragment[:len(expected)] == expected
Esempio n. 6
0
 def test_facted_page_xss(self, client, project):
     """Check the highlight of an ``XSS`` search begins with escaped HTML."""
     expected = """
     &lt;h3&gt;<em>XSS</em> exploit&lt;&#x2F;h3&gt;
     """.strip()
     prefix_length = len(expected)

     response = PageDocument.faceted_search(query='XSS', user='').execute()

     # Only the start of the top hit's first content fragment is compared.
     top_hit = response[0]
     assert top_hit.meta.highlight.content[0][:prefix_length] == expected
Esempio n. 7
0
def all_projects(es_index, mock_processed_json, db, settings):
    """
    Create every project in ``ALL_PROJECTS`` together with its HTML files.

    For each data file of a project an ``HTMLFile`` is created. If a JSON file
    exists for it, one ``SphinxDomain`` is created per entry of its
    ``domains`` list. Each ``HTMLFile`` is then passed to
    ``PageDocument().update``.

    :returns: the created projects, in shuffled order
    """
    settings.ELASTICSEARCH_DSL_AUTOSYNC = True

    created = []
    for slug in ALL_PROJECTS:
        project = get(
            Project,
            slug=slug,
            name=slug,
            main_language_project=None,
            privacy_level=PUBLIC,
        )
        project.versions.update(
            privacy_level=PUBLIC,
            built=True,
            active=True,
        )

        for basename in PROJECT_DATA_FILES[project.slug]:
            # Names in the config carry no extension, so append ``.html``.
            html_name = basename + '.html'
            version = project.versions.all()[0]
            html_file = get(
                HTMLFile,
                project=project,
                version=version,
                name=html_name,
                path=html_name,
                build=1,
            )

            # Create the sphinx domain test objects described by the JSON file.
            json_path = get_json_file_path(project.slug, basename)
            if os.path.exists(json_path):
                with open(json_path) as handle:
                    payload = json.load(handle)

                for entry in payload['domains']:
                    # ``role_name`` has the form ``<domain>:<type>``; the
                    # remaining keys are passed through as model fields.
                    domain, type_ = entry.pop('role_name').split(':')
                    get(
                        SphinxDomain,
                        project=project,
                        version=version,
                        html_file=html_file,
                        domain=domain,
                        type=type_,
                        **entry
                    )

            PageDocument().update(html_file)

        created.append(project)

    shuffle(created)
    return created
Esempio n. 8
0
    def get_queryset(self):
        """
        Build an Elasticsearch DSL ``Search`` object in place of a queryset.

        A Django queryset and an elasticsearch-dsl ``Search`` object follow a
        similar pattern, so the ``Search`` object can be returned here and
        still works with ``filter_backends`` and ``pagination_class``.
        """
        # Check that all the required query params are present first.
        self.validate_query_params()
        search_text = self.request.query_params.get('query', '')
        return PageDocument.simple_search(query=search_text)
Esempio n. 9
0
    def test_doc_search_subprojects_default_version(self, api_client,
                                                    all_projects):
        """Subproject results use the main project's version, or fall back to the subproject's default version."""
        main_project = all_projects[0]
        main_version = main_project.versions.all()[0]
        flag, _ = Feature.objects.get_or_create(
            feature_id=Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
        )
        main_project.feature_set.add(flag)

        subproject = all_projects[1]
        sub_version = subproject.versions.all()[0]

        # Rename the subproject's version and make it the default one, then
        # drop any subproject version sharing the main version's slug.
        sub_version.slug = 'different'
        sub_version.save()
        subproject.default_version = sub_version.slug
        subproject.save()
        subproject.versions.filter(slug=main_version.slug).delete()

        # Refresh the index for the renamed version's files.
        for html_file in HTMLFile.objects.all().filter(version=sub_version):
            PageDocument().update(html_file)

        # Attach the second project as a subproject of the first one.
        main_project.add_subproject(subproject)

        # Search for subproject content while filtering by the parent project.
        query = get_search_query_from_project_file(
            project_slug=subproject.slug)
        resp = self.get_search(
            api_client,
            {
                'q': query,
                'project': main_project.slug,
                'version': main_version.slug,
            },
        )
        assert resp.status_code == 200

        hits = resp.data['results']
        # Other projects may contribute results as well.
        assert len(hits) >= 1

        # The top hit has to be the subproject on its renamed version.
        top = hits[0]
        assert top['project'] == subproject.slug
        assert top['version'] == 'different'

        # Its link has to point at the subproject's documentation.
        expected_link = subproject.get_docs_url(
            version_slug=sub_version.slug)
        assert expected_link in top['domain'] + top['path']
Esempio n. 10
0
    def get_queryset(self):
        """
        Build an Elasticsearch DSL ``Search`` object in place of a queryset.

        A Django queryset and an elasticsearch-dsl ``Search`` object follow a
        similar pattern, so the ``Search`` object can be returned here and
        still works with ``filter_backends`` and ``pagination_class``.
        """
        # Check that all the required query params are present first.
        self.validate_query_params()
        params = self.request.query_params
        search_text = params.get('q', '')

        # The search is limited to the slugs from ``get_all_projects`` and to
        # the requested version; ``filter_by_user`` is switched off.
        search_options = {
            'filter_by_user': False,
            'projects_list': [p.slug for p in self.get_all_projects()],
            'versions_list': params.get('version'),
        }
        return PageDocument.faceted_search(
            query=search_text,
            user=self.request.user,
            **search_options
        )
Esempio n. 11
0
    def test_search_exact_match(self, client, project, case):
        """Quoted queries match the exact phrase, ignoring case.

        A quoted query such as ``"foo bar"`` should match exactly
        ``foo bar`` or ``Foo Bar`` etc.
        """
        # The word `Github` occurs in both the `kuma` and `pipeline` files,
        # but the phrase `Github can` only occurs in the kuma docs.
        # ``case`` names the ``str`` method used to change the query's case.
        query = getattr(r'"GitHub can"', case)()

        hits = PageDocument.faceted_search(query=query, user='').execute()

        assert len(hits) == 1
        only_hit = hits[0]
        assert only_hit['project'] == 'kuma'
        assert only_hit['path'] == 'documentation'
    def test_search_exact_match(self, client, project, case):
        """Check that a quoted query matches its exact phrase in any case.

        Searching for quoted text like ``"foo bar"`` should match exactly
        ``foo bar`` or ``Foo Bar`` etc.
        """
        # `Github` alone appears in the `kuma` and `pipeline` files, whereas
        # the phrase `Github can` is only available in the kuma docs.
        phrase = r'"GitHub can"'
        # ``case`` is the name of the ``str`` method applied to the phrase.
        change_case = getattr(phrase, case)

        search = PageDocument.faceted_search(query=change_case(), user='')
        hits = search.execute()

        assert len(hits) == 1
        assert hits[0]['project'] == 'kuma'
        assert hits[0]['path'] == 'documentation'
Esempio n. 13
0
    def test_search_combined_result(self, client, project):
        """Results combine the `AND` and `OR` matches of the query words.

        For a query `Foo Bar` the results should come in this order:

        - Pages where both `Foo Bar` are present
        - Pages where `Foo` or `Bar` is present
        """
        # ``open-source-philosophy`` page has both ``Official Support`` words
        # ``docker`` page has ``Support`` word
        # ``installation`` page has ``Official`` word
        expected_paths = ['open-source-philosophy', 'docker', 'installation']

        search = PageDocument.faceted_search(query='Official Support', user='')
        hits = search.execute()

        assert len(hits) == 3
        assert [hit.path for hit in hits] == expected_paths
Esempio n. 14
0
def all_projects(es_index, mock_processed_json, db, settings):
    """
    Create every project in ``ALL_PROJECTS`` together with its HTML files.

    One ``HTMLFile`` is created per data file of a project and passed to
    ``PageDocument().update``.

    :returns: the created projects, in shuffled order
    """
    settings.ELASTICSEARCH_DSL_AUTOSYNC = True

    created = []
    for slug in ALL_PROJECTS:
        project = G(Project, slug=slug, name=slug)

        for basename in PROJECT_DATA_FILES[project.slug]:
            # Names in the config carry no extension, so append ``.html``.
            html_name = basename + '.html'
            html_file = G(
                HTMLFile,
                project=project,
                version=project.versions.all()[0],
                name=html_name,
            )
            PageDocument().update(html_file)

        created.append(project)

    shuffle(created)
    return created
    def test_search_combined_result(self, client, project):
        """Check that results combine both `AND` and `OR` matches.

        When the query is `Foo Bar`, results should be ordered as follows:

        - Where both `Foo Bar` are present
        - Where `Foo` or `Bar` is present
        """
        response = PageDocument.faceted_search(
            query='Official Support',
            user='',
        ).execute()
        assert len(response) == 3

        # ``open-source-philosophy`` page has both ``Official Support`` words
        # ``docker`` page has ``Support`` word
        # ``installation`` page has ``Official`` word
        expected_paths = ['open-source-philosophy', 'docker', 'installation']
        actual_paths = [page.path for page in response]

        assert actual_paths == expected_paths
Esempio n. 16
0
    def test_search_combined_result(self, client, project):
        """Results combine the `AND` and `OR` matches of the query words.

        For a query `Foo Bar` the results should come in this order:

        - Pages where both `Foo Bar` are present
        - Pages where `Foo` or `Bar` is present
        """
        # ``guides/wipe-environment`` page has both ``Elasticsearch Query`` words
        # ``docker`` page has ``Elasticsearch`` word
        # ``installation`` page has ``Query`` word.
        expected_paths = ['guides/wipe-environment', 'docker', 'installation']

        search = PageDocument.faceted_search(
            query='Elasticsearch Query',
            user='',
        )
        hits = search.execute()

        assert len(hits) == 3
        assert [hit.path for hit in hits] == expected_paths
Esempio n. 17
0
    def test_search_correct_link_for_index_page_subdirectory_htmldir_projects(self, api_client, doctype):
        """The ``guides/index`` page is linked as ``/en/latest/guides/`` for the given ``doctype``."""
        docs = Project.objects.get(slug='docs')
        docs.versions.update(documentation_type=doctype)
        version = docs.versions.all().first()

        # Refresh the index for every file of that version.
        for html_file in HTMLFile.objects.all().filter(version=version):
            PageDocument().update(html_file)

        resp = self.get_search(
            api_client,
            {
                'project': docs.slug,
                'version': version.slug,
                'q': 'Some content from guides/index',
            },
        )
        assert resp.status_code == 200

        top = resp.data['results'][0]
        assert top['project'] == docs.slug
        assert top['path'] == '/en/latest/guides/'
Esempio n. 18
0
def elastic_project_search(request, project_slug):
    """
    Use elastic search to search in a project.

    Looks up the project among those returned by
    ``Project.objects.protected`` for the requesting user (404 if not found).
    When ``q`` is given, logs the query and runs a page search filtered by
    project and version, limited to the first 50 hits.

    :returns: the rendered ``search/elastic_project_search.html`` template
    """
    project = get_object_or_404(
        Project.objects.protected(request.user),
        slug=project_slug,
    )
    version_slug = request.GET.get('version', LATEST)
    query = request.GET.get('q', None)
    results = None

    if query:
        # Unauthenticated users are logged as an empty string.
        user = request.user if request.user.is_authenticated else ''
        log.info(
            LOG_TEMPLATE.format(
                user=user,
                project=project or '',
                type='inproject',
                version=version_slug or '',
                language='',
                msg=query or '',
            ),
        )

        search = PageDocument.simple_search(query=query)
        search = search.filter('term', project=project.slug)
        search = search.filter('term', version=version_slug)
        # Only the first 50 hits are fetched.
        results = search[:50].execute()

    context = {
        'project': project,
        'query': query,
        'results': results,
    }
    return render(request, 'search/elastic_project_search.html', context)
Esempio n. 19
0
def elastic_project_search(request, project_slug):
    """
    Use elastic search to search in a project.

    Looks up the project among those returned by
    ``Project.objects.protected`` for the requesting user (404 if not found).
    When ``q`` is given, runs a faceted page search restricted to this
    project and the requested version, then logs the results and the query.

    :returns: the rendered ``search/elastic_project_search.html`` template
    """
    project = get_object_or_404(
        Project.objects.protected(request.user),
        slug=project_slug,
    )
    version_slug = request.GET.get('version', LATEST)
    query = request.GET.get('q', None)
    results = None

    if query:
        search = PageDocument.faceted_search(
            query=query,
            user=request.user,
            projects_list=[project.slug],
            versions_list=version_slug,
        )
        results = search.execute()

        log.debug('Search results: %s', pformat(results.to_dict()))
        log.debug('Search facets: %s', pformat(results.facets.to_dict()))

        message = LOG_TEMPLATE.format(
            user=request.user,
            project=project or '',
            type='inproject',
            version=version_slug or '',
            language='',
            msg=query or '',
        )
        log.info(message)

    context = {
        'project': project,
        'query': query,
        'results': results,
    }
    return render(request, 'search/elastic_project_search.html', context)
Esempio n. 20
0
    def test_search_custom_ranking(self, api_client):
        """Changing page ranks changes the order of otherwise equal results."""
        project = Project.objects.get(slug='docs')
        version = project.versions.all().first()

        page_index = HTMLFile.objects.get(path='index.html')
        page_guides = HTMLFile.objects.get(path='guides/index.html')

        index_path = '/en/latest/index.html'
        guides_path = '/en/latest/guides/index.html'

        def assert_order(first_path, second_path):
            """Search for the phrase and check the two results' order."""
            params = {
                'project': project.slug,
                'version': version.slug,
                'q': '"content from"',
            }
            resp = self.get_search(api_client, params)
            assert resp.status_code == 200

            results = resp.data['results']
            assert len(results) == 2
            assert results[0]['path'] == first_path
            assert results[1]['path'] == second_path

        def rerank_both(index_rank, guides_rank):
            """Save new ranks on both pages, then re-index both."""
            page_index.rank = index_rank
            page_index.save()
            page_guides.rank = guides_rank
            page_guides.save()
            PageDocument().update(page_index)
            PageDocument().update(page_guides)

        # Default ranking: both pages start at rank 0.
        assert page_index.rank == 0
        assert page_guides.rank == 0
        assert_order(index_path, guides_path)

        # A higher rank on guides/index.html only.
        page_guides.rank = 5
        page_guides.save()
        PageDocument().update(page_guides)
        assert_order(guides_path, index_path)

        # A negative rank on index.html, a positive one on guides/index.html.
        rerank_both(-2, 4)
        assert_order(guides_path, index_path)

        # Both ranks positive, guides/index.html still the higher one.
        rerank_both(3, 6)
        assert_order(guides_path, index_path)

        # The same rank on both pages.
        rerank_both(-10, -10)
        assert_order(index_path, guides_path)
Esempio n. 21
0
def elastic_search(request):
    """
    Use Elasticsearch for global search.

    Reads ``q``, ``type`` (default ``'project'``), ``project``, ``version``
    (default ``LATEST``), ``taxonomy`` and ``language`` from the query string.
    When ``q`` is non-empty, runs either a project search or a file (page)
    search depending on ``type``, and logs the query.

    :param request: the incoming HTTP request
    :returns: the rendered ``search/elastic_search.html`` template, whose
        context holds the user input plus ``results`` and ``facets``
    """
    user_input = UserInput(
        query=request.GET.get('q'),
        type=request.GET.get('type', 'project'),
        project=request.GET.get('project'),
        version=request.GET.get('version', LATEST),
        taxonomy=request.GET.get('taxonomy'),
        language=request.GET.get('language'),
    )
    # Defaults handed to the template when nothing was searched.
    results = ''
    facets = {}

    if user_input.query:
        if user_input.type == 'project':
            project_search = ProjectDocument.faceted_search(
                query=user_input.query,
                user=request.user,
                language=user_input.language)
            results = project_search.execute()
            facets = results.facets
        elif user_input.type == 'file':
            kwargs = {}
            if user_input.project:
                kwargs['projects_list'] = [user_input.project]
            if user_input.version:
                kwargs['versions_list'] = [user_input.version]

            page_search = PageDocument.faceted_search(query=user_input.query,
                                                      user=request.user,
                                                      **kwargs)
            results = page_search.execute()
            facets = results.facets

        log.info(
            LOG_TEMPLATE.format(
                user=request.user,
                project=user_input.project or '',
                type=user_input.type or '',
                version=user_input.version or '',
                language=user_input.language or '',
                msg=user_input.query or '',
            ), )

    if results:
        if user_input.type == 'file':
            # Change results to turn newlines in highlight into periods
            # https://github.com/rtfd/readthedocs.org/issues/5168
            for result in results:
                if hasattr(result.meta.highlight, 'content'):
                    # Use a distinct name for the comprehension variable so it
                    # cannot shadow (or, under Python 2 scoping, rebind) the
                    # ``result`` being assigned to.
                    result.meta.highlight.content = [
                        fragment.replace('\n', '. ')
                        for fragment in result.meta.highlight.content
                    ]

        log.debug('Search results: %s', pformat(results.to_dict()))
        log.debug('Search facets: %s', pformat(results.facets.to_dict()))

    template_vars = user_input._asdict()
    template_vars.update({'results': results, 'facets': facets})
    return render(
        request,
        'search/elastic_search.html',
        template_vars,
    )