def process_api_page(page_path):
    """Render the API documentation template for *page_path*.

    Looks the page up via ``get_api_page`` (using the module-level
    ``build_mode`` flag) and feeds it to the ``api.html`` template.
    """
    api_page = get_api_page(build_mode, page_path)
    return render_template('api.html', page=api_page)
def build_search_indices(site_structure, pages):
    """Build search-index objects for every indexable page and submit them.

    Walks ``site_structure`` (pairs of ``(url, endpoint)``), builds one or
    more index objects per page depending on the endpoint kind (markdown
    page, generated API page, or a plain rendered page fetched through the
    Flask test client), then uploads the whole batch to the search index.

    :param site_structure: iterable of ``(url, endpoint)`` pairs.
    :param pages: mapping from page path to a parsed markdown page object
        (must expose ``parsed_html`` and ``meta['title']``).
    :raises Exception: if fetching a rendered page returns a non-200 status.
    """
    page_views_statistic = get_page_views_statistic()
    index_objects = []
    print("Start building index")
    for url, endpoint in site_structure:
        # Only HTML documents and directory roots are indexable.
        if (not url.endswith('.html')) and (not url.endswith('/')):
            continue
        print("Processing " + url)
        # Page views act as a ranking signal; unknown pages default to 0.
        page_views = page_views_statistic.get(url, 0)
        page_path = get_page_path_from_url(url)
        if endpoint == 'page':
            page_part = pages.get(page_path)
            page_type = "Page"
            if page_path.startswith('community'):
                page_type = 'Community'
            elif page_path.startswith('docs/reference'):
                page_type = 'Reference'
            elif page_path.startswith('docs/tutorials'):
                page_type = 'Tutorial'
            index_objects += get_markdown_page_index_objects(
                page_part.parsed_html,
                url,
                page_path,
                page_part.meta['title'],
                page_type,
                page_views
            )
        elif endpoint == "api_page":
            # page_path starts with "api/"; strip that prefix for the lookup.
            page_info = get_api_page(True, page_path[4:])
            # Tables and overload signature groups are noise for search.
            for table in page_info['content']('table'):
                table.extract()
            for overload_group in page_info['content'].findAll("div", {"class": "signature"}):
                overload_group.extract()
            breadcrumbs = page_info['content'].find("div", {"class": "api-docs-breadcrumbs"})
            full_name = page_info['title']
            if breadcrumbs is not None:
                full_name_parts = list(map(lambda link: link.text, breadcrumbs.findAll("a")))
                # Drop the module-name crumb so titles start at the package.
                if "kotlin-stdlib" in full_name_parts:
                    full_name_parts.remove("kotlin-stdlib")
                else:
                    full_name_parts.remove("kotlin.test")
                # Decode HTML entities appearing in generic type names.
                # (Was a no-op `.replace('<', '<')` — fixed to decode
                # `&lt;`/`&gt;` into the actual angle brackets.)
                full_name = " › ".join(full_name_parts).replace('&lt;', '<').replace('&gt;', '>')
                breadcrumbs.extract()
            # Renamed from `type` to avoid shadowing the builtin.
            page_kind = "Standard Library" if "jvm/stdlib" in url else "Kotlin Test"
            index_objects += get_page_index_objects(page_info['content'], url, page_path, full_name, page_kind, page_views)
        elif endpoint in ["coroutines_redirect", "coroutines_tutor_redirect", "events_redirect",
                          "community_redirect", "compatibility_redirect", "collections_redirect",
                          "community_user_groups_redirect"]:
            # Pure redirects carry no content worth indexing.
            continue
        else:
            # Anything else: render the page through the test client and
            # index whatever content container we can find.
            client = app.test_client()
            content = client.get(url, follow_redirects=True)
            if content.status_code != 200:
                raise Exception('Bad response during indexing')
            parsed = BeautifulSoup(content.data, "html.parser")
            title = parsed.find("title").text
            content = parsed.find("div", {"class": "page-content"})
            if content is None:
                content = parsed.find("article", {"class": "page-content"})
            if content is None:
                # No recognizable content container: index title only.
                index_objects.append({
                    'objectID': page_path,
                    'type': 'Page',
                    'headings': title,
                    'url': url,
                    'content': '',
                    'pageViews': page_views
                })
            else:
                index_objects += get_page_index_objects(
                    content, url, page_path, title, "Page", page_views
                )
    print("Index objects successfully built")
    index = get_index()
    print("Submitting index objects to " + index.index_name + " index")
    index.add_objects(index_objects)
def build_search_indices(pages, version):
    """Build search-index objects for all pages and submit them to both the
    main index and the WebHelp (versioned) index.

    :param pages: iterable of ``(url, endpoint)`` pairs.
    :param version: site version string, baked into each WebHelp index object
        via ``to_wh_index``.
    """
    page_views_statistic = get_page_views_statistic()
    index_objects = []
    wh_index_objects = []
    print("Start building index")
    for url, endpoint in pages:
        # Normalize directory URLs to their index document.
        if url.endswith('/'):
            url += 'index.html'
        if not url.endswith('.html'):
            continue
        title = ''
        content = ''
        page_type = 'Page'
        page_path = get_page_path_from_url(url)
        # Page views act as a ranking signal; unknown pages default to 0.
        page_views = page_views_statistic.get(url, 0)
        if page_path.startswith('community'):
            page_type = 'Community'
        elif page_path.startswith('docs/reference'):
            page_type = 'Reference'
        elif page_path.startswith('docs/tutorials'):
            page_type = 'Tutorial'
        html_content = get_page_content(url)
        parsed = BeautifulSoup(html_content, "html.parser")
        # Meta-refresh pages are redirects — nothing to index.
        if parsed.find("meta", {"http-equiv": "refresh"}):
            continue
        if page_path.startswith("api/latest/"):
            # page_path starts with "api/"; strip that prefix for the lookup.
            page_info = get_api_page(True, page_path[4:], dist_path)
            # Tables and overload signature groups are noise for search.
            for table in page_info['content']('table'):
                table.extract()
            for overload_group in page_info['content'].findAll(
                    "div", {"class": "signature"}):
                overload_group.extract()
            breadcrumbs = page_info['content'].find(
                "div", {"class": "api-docs-breadcrumbs"})
            title = page_info['title']
            if breadcrumbs is not None:
                full_name_parts = list(
                    map(lambda link: link.text, breadcrumbs.findAll("a")))
                # Drop the module-name crumb so titles start at the package.
                if "kotlin-stdlib" in full_name_parts:
                    full_name_parts.remove("kotlin-stdlib")
                else:
                    full_name_parts.remove("kotlin.test")
                # Decode HTML entities appearing in generic type names.
                # (Was a no-op `.replace('<', '<')` — fixed to decode
                # `&lt;`/`&gt;` into the actual angle brackets.)
                title = " › ".join(full_name_parts).replace('&lt;', '<').replace(
                    '&gt;', '>')
                breadcrumbs.extract()
            page_type = "Standard Library" if "jvm/stdlib" in url else "Kotlin Test"
            content = page_info['content'].find('article', {"role": "main"})
        else:
            # Prefer the explicit search title, fall back to <title>.
            body_title = parsed.select_one("body[data-search-title]")
            if body_title:
                title = body_title.attrs["data-search-title"]
            if not title:
                title_node = parsed.find("title")
                if title_node:
                    title = title_node.text
            # Our default pages
            content = parsed.find("div", {"class": "page-content"})
            # Our modern pages
            if content is None:
                content = parsed.find("article", {"class": "page-content"})
            # WebHelp pages
            if content is None:
                content = parsed.find("article", {"class": "article"})
        if title and content:
            # Choose the indexer matching the page's markup flavor.
            page_indexer = get_page_index_objects
            if parsed.select_one("body[data-article-props]"):
                page_type = "Documentation"
                page_indexer = get_webhelp_page_index_objects
            elif page_type == "Page":
                page_indexer = get_markdown_page_index_objects
            print("processing " + url + ' - ' + page_type)
            page_indices = page_indexer(content, url, page_path, title,
                                        page_type, page_views)
            index_objects += page_indices

            def wh(*args):
                return to_wh_index(version, *args)

            wh_index_objects += list(map(wh, page_indices.copy()))
        else:
            print('skip: ' + url + ' unknown page content in with title: ' + title)
    wh_index = get_wh_index()
    if wh_index:
        print("Submitting WH index objects to " + wh_index.index_name + " index")
        wh_index.add_objects(wh_index_objects)
    print("Index objects successfully built")
    index = get_index()
    print("Submitting index objects to " + index.index_name + " index")
    index.add_objects(index_objects)
def process_api_page(page_path):
    """Render the API documentation template for *page_path*."""
    api_page = get_api_page(page_path)
    return render_template('api.html', page=api_page)
def build_search_indices(site_structure, pages):
    """Build search-index objects for every indexable page and submit them.

    Walks ``site_structure`` (pairs of ``(url, endpoint)``), builds one or
    more index objects per page depending on the endpoint kind (markdown
    page, generated API page, or a plain rendered page fetched through the
    Flask test client), then uploads the whole batch to the search index.

    :param site_structure: iterable of ``(url, endpoint)`` pairs.
    :param pages: mapping from page path to a parsed markdown page object
        (must expose ``parsed_html`` and ``meta['title']``).
    :raises Exception: if fetching a rendered page returns a non-200 status.
    """
    page_views_statistic = get_page_views_statistic()
    index_objects = []
    print("Start building index")
    for url, endpoint in site_structure:
        # Only HTML documents and directory roots are indexable.
        if (not url.endswith('.html')) and (not url.endswith('/')):
            continue
        print("Processing " + url)
        # Page views act as a ranking signal; unknown pages default to 0.
        page_views = page_views_statistic.get(url, 0)
        page_path = get_page_path_from_url(url)
        if endpoint == 'page':
            page_part = pages.get(page_path)
            page_type = "Page"
            if page_path.startswith('community'):
                page_type = 'Community'
            elif page_path.startswith('docs/reference'):
                page_type = 'Reference'
            elif page_path.startswith('docs/tutorials'):
                page_type = 'Tutorial'
            index_objects += get_markdown_page_index_objects(
                page_part.parsed_html,
                url,
                page_path,
                page_part.meta['title'],
                page_type,
                page_views
            )
        elif endpoint == "api_page":
            # page_path starts with "api/"; strip that prefix for the lookup.
            page_info = get_api_page(page_path[4:])
            # Tables and overload signature groups are noise for search.
            for table in page_info['content']('table'):
                table.extract()
            for overload_group in page_info['content'].findAll("div", {"class": "signature"}):
                overload_group.extract()
            breadcrumbs = page_info['content'].find("div", {"class": "api-docs-breadcrumbs"})
            full_name = page_info['title']
            if breadcrumbs is not None:
                full_name_parts = list(map(lambda link: link.text, breadcrumbs.findAll("a")))
                # Drop the module-name crumb so titles start at the package.
                if "kotlin-stdlib" in full_name_parts:
                    full_name_parts.remove("kotlin-stdlib")
                else:
                    full_name_parts.remove("kotlin-test")
                # Decode HTML entities appearing in generic type names.
                # (Was a no-op `.replace('<', '<')` — fixed to decode
                # `&lt;`/`&gt;` into the actual angle brackets.)
                full_name = " › ".join(full_name_parts).replace('&lt;', '<').replace('&gt;', '>')
                breadcrumbs.extract()
            # Renamed from `type` to avoid shadowing the builtin.
            page_kind = "Standard Library" if "jvm/stdlib" in url else "Kotlin Test"
            index_objects += get_page_index_objects(page_info['content'], url, page_path, full_name, page_kind, page_views)
        elif endpoint in ["coroutines_alias", "events_redirect", "community_redirect"]:
            # Pure redirects carry no content worth indexing.
            continue
        else:
            # Anything else: render the page through the test client and
            # index whatever content container we can find.
            client = app.test_client()
            content = client.get(url, follow_redirects=True)
            if content.status_code != 200:
                raise Exception('Bad response during indexing')
            parsed = BeautifulSoup(content.data, "html.parser")
            title = parsed.find("title").text
            content = parsed.find("div", {"class": "page-content"})
            if content is None:
                content = parsed.find("article", {"class": "page-content"})
            if content is None:
                # No recognizable content container: index title only.
                index_objects.append({
                    'objectID': page_path,
                    'type': 'Page',
                    'headings': title,
                    'url': url,
                    'content': '',
                    'pageViews': page_views
                })
            else:
                index_objects += get_page_index_objects(
                    content, url, page_path, title, "Page", page_views
                )
    print("Index objects successfully built")
    index = get_index()
    print("Submitting index objects to " + index.index_name + " index")
    index.add_objects(index_objects)