def test_doc_search_pagination(self, api_client, project):
    """Doc search results must be paginated, honoring ``page_size``."""
    latest_version = project.versions.all()[0]
    html_file = HTMLFile.objects.filter(version=latest_version)[0]
    title = html_file.processed_json['title']
    query = title.split()[0]

    # Clone the same HTML file 60 more times.
    for _ in range(60):
        # Clearing the primary key makes Django insert a new row on save.
        html_file.pk = None
        html_file.save()
        PageDocument().update(html_file)

    search_params = {
        'q': query,
        'project': project.slug,
        'version': latest_version.slug,
    }
    resp = self.get_search(api_client, search_params)
    assert resp.status_code == 200

    # 1 pre-existing file plus the 60 clones created above.
    assert resp.data['count'] == 61
    # A next-page URL must be present.
    assert resp.data['next'] is not None
    # The default page size is 50.
    assert len(resp.data['results']) == 50

    # Passing `page_size` must override the default pagination.
    search_params['page_size'] = 5
    resp = self.get_search(api_client, search_params)
    assert resp.status_code == 200
    assert len(resp.data['results']) == 5
def test_facted_page_xss(self, client, project):
    """XSS payloads must come back HTML-escaped inside section highlights."""
    query = 'XSS'
    results = PageDocument.faceted_search(query=query, user='').execute()
    expected = '<h3>&lt;span&gt;XSS&lt;/span&gt; exploit</h3>'.replace('&lt;', '<').replace('&gt;', '>')

    hits = results.hits.hits
    # There should be only one result.
    assert len(hits) == 1
    inner_hits = hits[0]['inner_hits']

    # There shouldn't be any results from the domains section.
    domain_hits = inner_hits['domains']['hits']['hits']
    assert len(domain_hits) == 0

    section_hits = inner_hits['sections']['hits']['hits']
    assert len(section_hits) == 1

    section_content_highlight = section_hits[0]['highlight']['sections.content']
    assert len(section_content_highlight) == 1
    assert expected in section_content_highlight[0]
def elastic_search(request):
    """
    Use Elasticsearch for global search.

    Reads ``q``, ``type``, ``project``, ``version``, ``taxonomy`` and
    ``language`` from the query string, runs either a project or a file
    (page) search, and renders the results template.
    """
    user_input = UserInput(
        query=request.GET.get('q'),
        type=request.GET.get('type', 'project'),
        project=request.GET.get('project'),
        version=request.GET.get('version', LATEST),
        taxonomy=request.GET.get('taxonomy'),
        language=request.GET.get('language'),
    )
    results = ''
    facets = {}

    if user_input.query:
        if user_input.type == 'project':
            project_search = ProjectDocument.faceted_search(
                query=user_input.query, language=user_input.language)
            results = project_search.execute()
            facets = results.facets
        elif user_input.type == 'file':
            kwargs = {}
            if user_input.project:
                # Restrict the page search to projects the user may access;
                # raises 404 when the slug does not resolve for this user.
                projects_list = get_project_list_or_404(
                    project_slug=user_input.project, user=request.user)
                kwargs['projects_list'] = [
                    project.slug for project in projects_list
                ]
            if user_input.version:
                # NOTE(review): other call sites wrap the version in a list —
                # confirm whether faceted_search expects a scalar here.
                kwargs['versions_list'] = user_input.version
            page_search = PageDocument.faceted_search(
                query=user_input.query, **kwargs)
            results = page_search.execute()
            facets = results.facets

    if settings.DEBUG:
        # Bug fix: `pprint()` already writes to stdout and returns None, so
        # the previous `print(pprint(...))` emitted a spurious "None" line.
        pprint(results)
        pprint(facets)

    if user_input.query:
        user = ''
        # `is_authenticated` is a property on modern Django (>= 1.10);
        # calling it as a method fails there. This also matches the usage
        # in `elastic_project_search`.
        if request.user.is_authenticated:
            user = request.user
        log.info(
            LOG_TEMPLATE.format(
                user=user,
                project=user_input.project or '',
                type=user_input.type or '',
                version=user_input.version or '',
                language=user_input.language or '',
                msg=user_input.query or '',
            ))

    template_vars = user_input._asdict()
    template_vars.update({'results': results, 'facets': facets})
    return render(
        request,
        'search/elastic_search.html',
        template_vars,
    )
def test_doc_search_filter_by_version(self, api_client, project):
    """Doc search results must be filtered by the requested version."""
    query = get_search_query_from_project_file(project_slug=project.slug)
    latest_version = project.versions.all()[0]

    # Create another version.
    dummy_version = G(
        Version,
        project=project,
        active=True,
        privacy_level=PUBLIC,
    )

    # Clone every HTMLFile of the latest version into the new version.
    for html_file in HTMLFile.objects.all().filter(version=latest_version):
        html_file.version = dummy_version
        # Clearing the primary key makes Django insert a new row on save.
        html_file.pk = None
        html_file.save()
        PageDocument().update(html_file)

    search_params = {
        'q': query,
        'project': project.slug,
        'version': dummy_version.slug,
    }
    resp = self.get_search(api_client, search_params)
    assert resp.status_code == 200

    data = resp.data['results']
    assert len(data) == 1
    assert data[0]['project'] == project.slug
def test_facted_page_xss(self, client, project):
    """The XSS payload must stay escaped in the highlighted content."""
    results = PageDocument.faceted_search(query='XSS', user='').execute()
    expected = '<h3><em>XSS</em> exploit</h3>'
    highlighted = results[0].meta.highlight.content[0]
    assert highlighted[:len(expected)] == expected
def test_facted_page_xss(self, client, project):
    """Highlighted content must keep the XSS payload HTML-escaped."""
    page_search = PageDocument.faceted_search(query='XSS', user='')
    results = page_search.execute()
    expected = '<h3><em>XSS</em> exploit</h3>'
    first_highlight = results[0].meta.highlight.content[0]
    assert first_highlight.startswith(expected)
def all_projects(es_index, mock_processed_json, db, settings):
    """Create every test project, its HTML files and Sphinx domains, indexed."""
    settings.ELASTICSEARCH_DSL_AUTOSYNC = True
    projects_list = []

    for project_slug in ALL_PROJECTS:
        project = get(
            Project,
            slug=project_slug,
            name=project_slug,
            main_language_project=None,
            privacy_level=PUBLIC,
        )
        project.versions.update(
            privacy_level=PUBLIC,
            built=True,
            active=True,
        )

        for file_basename in PROJECT_DATA_FILES[project.slug]:
            # Basenames in the config carry no extension; add ".html".
            file_name = file_basename + '.html'
            version = project.versions.all()[0]
            html_file = get(
                HTMLFile,
                project=project,
                version=version,
                name=file_name,
                path=file_name,
                build=1,
            )

            # Create SphinxDomain test objects from the JSON fixture, if any.
            file_path = get_json_file_path(project.slug, file_basename)
            if os.path.exists(file_path):
                with open(file_path) as json_fh:
                    data = json.load(json_fh)
                for domain_data in data['domains']:
                    # `role_name` is "<domain>:<type>", e.g. "py:function".
                    domain, type_ = domain_data.pop('role_name').split(':')
                    get(
                        SphinxDomain,
                        project=project,
                        version=version,
                        html_file=html_file,
                        domain=domain,
                        type=type_,
                        **domain_data,
                    )

            PageDocument().update(html_file)

        projects_list.append(project)

    shuffle(projects_list)
    return projects_list
def get_queryset(self):
    """
    Return an Elasticsearch DSL ``Search`` object instead of a Django queryset.

    Django querysets and elasticsearch-dsl ``Search`` objects follow a similar
    pattern, so the configured ``filter_backends`` and ``pagination_class``
    work against the ``Search`` object as well.
    """
    # All required query params must be present before searching.
    self.validate_query_params()
    query = self.request.query_params.get('query', '')
    return PageDocument.simple_search(query=query)
def test_doc_search_subprojects_default_version(self, api_client, all_projects):
    """
    Return results from subprojects matching the main project's version,
    falling back to the subproject's default version otherwise.
    """
    project = all_projects[0]
    version = project.versions.all()[0]
    feature, _ = Feature.objects.get_or_create(
        feature_id=Feature.SEARCH_SUBPROJECTS_ON_DEFAULT_VERSION,
    )
    project.feature_set.add(feature)

    subproject = all_projects[1]
    subproject_version = subproject.versions.all()[0]
    # Rename the subproject's version and make it the default.
    subproject_version.slug = 'different'
    subproject_version.save()
    subproject.default_version = subproject_version.slug
    subproject.save()
    # Drop any subproject version matching the parent's slug so the
    # fallback path is exercised.
    subproject.versions.filter(slug=version.slug).delete()

    # Refresh the index for the renamed version.
    for html_file in HTMLFile.objects.all().filter(version=subproject_version):
        PageDocument().update(html_file)

    # Attach the second project as a subproject of the first.
    project.add_subproject(subproject)

    # Search for subproject content while filtering by the parent project.
    query = get_search_query_from_project_file(project_slug=subproject.slug)
    search_params = {
        'q': query,
        'project': project.slug,
        'version': version.slug,
    }
    resp = self.get_search(api_client, search_params)
    assert resp.status_code == 200

    data = resp.data['results']
    # Other projects may also match; only the ordering of the first hit matters.
    assert len(data) >= 1

    # The first result must come from the subproject's default version.
    first_result = data[0]
    assert first_result['project'] == subproject.slug
    assert first_result['version'] == 'different'

    # The link must point at the subproject's documentation.
    document_link = subproject.get_docs_url(version_slug=subproject_version.slug)
    link = first_result['domain'] + first_result['path']
    assert document_link in link
def get_queryset(self):
    """
    Return an Elasticsearch DSL ``Search`` object instead of a Django queryset.

    Django querysets and elasticsearch-dsl ``Search`` objects follow a similar
    pattern, so the configured ``filter_backends`` and ``pagination_class``
    work against the ``Search`` object as well.
    """
    # All required query params must be present before searching.
    self.validate_query_params()
    query = self.request.query_params.get('q', '')
    kwargs = {
        # Access control is handled via the projects list, not the user.
        'filter_by_user': False,
        'projects_list': [p.slug for p in self.get_all_projects()],
        'versions_list': self.request.query_params.get('version'),
    }
    return PageDocument.faceted_search(
        query=query,
        user=self.request.user,
        **kwargs,
    )
def test_search_exact_match(self, client, project, case):
    """
    A quoted query must match the exact phrase, case-insensitively.

    Searching ``"foo bar"`` should match ``foo bar``, ``Foo Bar``, etc.
    """
    # "GitHub" appears in both the `kuma` and `pipeline` files, but the
    # phrase "GitHub can" exists only in the kuma docs.
    query_text = r'"GitHub can"'
    query = getattr(query_text, case)()

    results = PageDocument.faceted_search(query=query, user='').execute()

    assert len(results) == 1
    assert results[0]['project'] == 'kuma'
    assert results[0]['path'] == 'documentation'
def test_search_exact_match(self, client, project, case):
    """
    Quoted queries must match the exact phrase regardless of case.

    A query like ``"foo bar"`` matches ``foo bar`` and ``Foo Bar`` alike.
    """
    # The phrase "GitHub can" is present only in the kuma docs, even though
    # the single word "GitHub" also appears in the pipeline docs.
    query_text = r'"GitHub can"'
    cased_query = getattr(query_text, case)()

    page_search = PageDocument.faceted_search(query=cased_query, user='')
    results = page_search.execute()

    assert len(results) == 1
    assert results[0]['project'] == 'kuma'
    assert results[0]['path'] == 'documentation'
def test_search_combined_result(self, client, project):
    """
    Results must combine `AND` and `OR` semantics, ordered so that pages
    matching every query word rank above pages matching only some words.
    """
    query = 'Official Support'
    results = PageDocument.faceted_search(query=query, user='').execute()
    assert len(results) == 3

    # ``open-source-philosophy`` contains both words;
    # ``docker`` contains only "Support"; ``installation`` only "Official".
    expected_paths = ['open-source-philosophy', 'docker', 'installation']
    assert [hit.path for hit in results] == expected_paths
def all_projects(es_index, mock_processed_json, db, settings):
    """Create every test project with its HTML files and index them."""
    settings.ELASTICSEARCH_DSL_AUTOSYNC = True
    projects_list = []

    for project_slug in ALL_PROJECTS:
        project = G(Project, slug=project_slug, name=project_slug)
        for file_basename in PROJECT_DATA_FILES[project.slug]:
            # Basenames in the config carry no extension; add ".html".
            file_name = file_basename + '.html'
            version = project.versions.all()[0]
            html_file = G(
                HTMLFile,
                project=project,
                version=version,
                name=file_name,
            )
            PageDocument().update(html_file)
        projects_list.append(project)

    shuffle(projects_list)
    return projects_list
def test_search_combined_result(self, client, project):
    """
    Search must blend `AND` and `OR` matching: pages containing the whole
    phrase come first, followed by pages containing any single word.
    """
    page_search = PageDocument.faceted_search(query='Official Support', user='')
    results = page_search.execute()
    assert len(results) == 3

    result_paths = [r.path for r in results]
    # ``open-source-philosophy`` has both "Official" and "Support";
    # ``docker`` has only "Support"; ``installation`` has only "Official".
    assert result_paths == ['open-source-philosophy', 'docker', 'installation']
def test_search_combined_result(self, client, project):
    """
    Search must blend `AND` and `OR` matching: pages containing the whole
    phrase come first, followed by pages containing any single word.
    """
    page_search = PageDocument.faceted_search(query='Elasticsearch Query', user='')
    results = page_search.execute()
    assert len(results) == 3

    result_paths = [r.path for r in results]
    # ``guides/wipe-environment`` has both "Elasticsearch" and "Query";
    # ``docker`` has only "Elasticsearch"; ``installation`` has only "Query".
    assert result_paths == ['guides/wipe-environment', 'docker', 'installation']
def test_search_correct_link_for_index_page_subdirectory_htmldir_projects(self, api_client, doctype):
    """Index pages in subdirectories must link to the directory URL."""
    project = Project.objects.get(slug='docs')
    project.versions.update(documentation_type=doctype)
    version = project.versions.all().first()

    # Reindex the version's files so the new doctype is reflected.
    for html_file in HTMLFile.objects.all().filter(version=version):
        PageDocument().update(html_file)

    search_params = {
        'project': project.slug,
        'version': version.slug,
        'q': 'Some content from guides/index',
    }
    resp = self.get_search(api_client, search_params)
    assert resp.status_code == 200

    result = resp.data['results'][0]
    assert result['project'] == project.slug
    assert result['path'] == '/en/latest/guides/'
def elastic_project_search(request, project_slug):
    """Use elastic search to search within a single project."""
    queryset = Project.objects.protected(request.user)
    project = get_object_or_404(queryset, slug=project_slug)
    version_slug = request.GET.get('version', LATEST)
    query = request.GET.get('q', None)

    results = None
    if query:
        user = request.user if request.user.is_authenticated else ''
        log.info(
            LOG_TEMPLATE.format(
                user=user,
                project=project or '',
                type='inproject',
                version=version_slug or '',
                language='',
                msg=query or '',
            ),
        )

        # Restrict the search to this project and version, then take the
        # first 50 hits.
        search = (
            PageDocument.simple_search(query=query)
            .filter('term', project=project.slug)
            .filter('term', version=version_slug)
        )
        results = search[:50].execute()

    return render(
        request,
        'search/elastic_project_search.html',
        {
            'project': project,
            'query': query,
            'results': results,
        },
    )
def elastic_project_search(request, project_slug):
    """Use elastic search to search within a single project."""
    queryset = Project.objects.protected(request.user)
    project = get_object_or_404(queryset, slug=project_slug)
    version_slug = request.GET.get('version', LATEST)
    query = request.GET.get('q', None)

    results = None
    if query:
        # Restrict the faceted search to this project and version.
        search_kwargs = {
            'projects_list': [project.slug],
            'versions_list': version_slug,
        }
        page_search = PageDocument.faceted_search(
            query=query,
            user=request.user,
            **search_kwargs,
        )
        results = page_search.execute()
        log.debug('Search results: %s', pformat(results.to_dict()))
        log.debug('Search facets: %s', pformat(results.facets.to_dict()))

        log.info(
            LOG_TEMPLATE.format(
                user=request.user,
                project=project or '',
                type='inproject',
                version=version_slug or '',
                language='',
                msg=query or '',
            ),
        )

    return render(
        request,
        'search/elastic_project_search.html',
        {
            'project': project,
            'query': query,
            'results': results,
        },
    )
def test_search_custom_ranking(self, api_client):
    """A page's ``rank`` must reorder otherwise equally-matching results."""
    project = Project.objects.get(slug='docs')
    version = project.versions.all().first()

    page_index = HTMLFile.objects.get(path='index.html')
    page_guides = HTMLFile.objects.get(path='guides/index.html')

    search_params = {
        'project': project.slug,
        'version': version.slug,
        'q': '"content from"',
    }

    def result_paths():
        # Run the search and return the ordered list of result paths.
        response = self.get_search(api_client, search_params)
        assert response.status_code == 200
        hits = response.data['results']
        assert len(hits) == 2
        return [hit['path'] for hit in hits]

    # Default ranking: both pages at rank 0, index.html first.
    assert page_index.rank == 0
    assert page_guides.rank == 0
    assert result_paths() == [
        '/en/latest/index.html',
        '/en/latest/guides/index.html',
    ]

    # Higher rank on guides/index.html moves it first.
    page_guides.rank = 5
    page_guides.save()
    PageDocument().update(page_guides)
    assert result_paths() == [
        '/en/latest/guides/index.html',
        '/en/latest/index.html',
    ]

    # Negative rank on index.html keeps guides first.
    page_index.rank = -2
    page_index.save()
    page_guides.rank = 4
    page_guides.save()
    PageDocument().update(page_index)
    PageDocument().update(page_guides)
    assert result_paths() == [
        '/en/latest/guides/index.html',
        '/en/latest/index.html',
    ]

    # Both positive, guides still higher: guides stays first.
    page_index.rank = 3
    page_index.save()
    page_guides.rank = 6
    page_guides.save()
    PageDocument().update(page_index)
    PageDocument().update(page_guides)
    assert result_paths() == [
        '/en/latest/guides/index.html',
        '/en/latest/index.html',
    ]

    # Equal ranks fall back to the default ordering.
    page_index.rank = -10
    page_index.save()
    page_guides.rank = -10
    page_guides.save()
    PageDocument().update(page_index)
    PageDocument().update(page_guides)
    assert result_paths() == [
        '/en/latest/index.html',
        '/en/latest/guides/index.html',
    ]
def elastic_search(request):
    """
    Use Elasticsearch for global search.

    Reads ``q``, ``type``, ``project``, ``version``, ``taxonomy`` and
    ``language`` from the query string, runs either a project or a file
    (page) search, and renders the results template.
    """
    user_input = UserInput(
        query=request.GET.get('q'),
        type=request.GET.get('type', 'project'),
        project=request.GET.get('project'),
        version=request.GET.get('version', LATEST),
        taxonomy=request.GET.get('taxonomy'),
        language=request.GET.get('language'),
    )
    results = ''
    facets = {}

    if user_input.query:
        if user_input.type == 'project':
            project_search = ProjectDocument.faceted_search(
                query=user_input.query,
                user=request.user,
                language=user_input.language,
            )
            results = project_search.execute()
            facets = results.facets
        elif user_input.type == 'file':
            kwargs = {}
            if user_input.project:
                kwargs['projects_list'] = [user_input.project]
            if user_input.version:
                kwargs['versions_list'] = [user_input.version]
            page_search = PageDocument.faceted_search(
                query=user_input.query,
                user=request.user,
                **kwargs,
            )
            results = page_search.execute()
            facets = results.facets

    log.info(
        LOG_TEMPLATE.format(
            user=request.user,
            project=user_input.project or '',
            type=user_input.type or '',
            version=user_input.version or '',
            language=user_input.language or '',
            msg=user_input.query or '',
        ),
    )

    if results:
        if user_input.type == 'file':
            # Turn newlines inside highlights into periods.
            # https://github.com/rtfd/readthedocs.org/issues/5168
            for result in results:
                if hasattr(result.meta.highlight, 'content'):
                    result.meta.highlight.content = [
                        fragment.replace('\n', '. ')
                        for fragment in result.meta.highlight.content
                    ]

        log.debug('Search results: %s', pformat(results.to_dict()))
        log.debug('Search facets: %s', pformat(results.facets.to_dict()))

    template_vars = user_input._asdict()
    template_vars.update({'results': results, 'facets': facets})
    return render(
        request,
        'search/elastic_search.html',
        template_vars,
    )