def post2list(ele: PageElement):
    """Convert a header/body post block into a nested list of dicts.

    The function reads the ``<li>`` items of the ``div`` with class ``hd``
    as headers and the ``<ul>`` lists of the ``div`` with class ``bd`` as
    their contents; the n-th ``<ul>`` is attached to the n-th header.

    Args:
        ele: Element that contains both the ``hd`` and the ``bd`` ``div``.

    Returns:
        A list with one dict per header. ``name`` and ``link`` are the text
        and ``href`` of the header's ``<a>``; ``children`` is a list of
        dicts, one per ``<li>`` of the matching ``<ul>``, with:

        * ``name``: text of the item's last ``<a>`` with all whitespace
          removed,
        * ``link``: ``href`` of that same ``<a>``,
        * ``new``: ``True`` when the item contains an ``<img>``,
        * ``date``: text of the item's ``<span>``, or ``''`` without one.

    Raises:
        TypeError: If the ``hd`` or ``bd`` container is not found.
        AttributeError: If a header ``<li>`` has no ``<a>``.
        KeyError: If a used ``<a>`` has no ``href`` attribute.
    """
    headers = ele.find('div', class_='hd')('li')
    post_list = [
        {'name': header.a.text, 'link': header.a['href'], 'children': []}
        for header in headers
    ]

    uls = ele.find('div', class_='bd')('ul')
    # zip() pairs headers and lists by position and stops at the shorter
    # sequence, so a header without a matching <ul> simply keeps an empty
    # ``children`` list instead of raising IndexError.
    for post, ul in zip(post_list, uls):
        for li in ul('li'):
            anchors = li('a')
            if not anchors:
                # Without a link there is neither a name nor a target.
                continue
            anchor = anchors[-1]
            span = li.span
            post['children'].append({
                'name': ''.join(anchor.text.split()),
                'link': anchor['href'],
                'new': bool(li.img),
                'date': span.text if span else '',
            })
    return post_list
def parse_video_block(video_block: PageElement) -> Dict:
    """Extract title, download link and transcript links from a video block.

    Args:
        video_block: Element that wraps a single video.

    Returns:
        A dict with three keys:

        * ``video_title``: text of the first ``<h3>``, or ``None`` when the
          block has no ``<h3>``.
        * ``video_link``: ``href`` of the download button, or ``None`` when
          the button or its ``href`` is missing.
        * ``transcript_link``: list of transcript URLs found under
          ``.wrapper-download-transcripts``, without duplicates and in
          document order. URLs lacking a scheme get ``https``; URLs lacking
          a host get ``courses.edx.org``.
    """
    video_object = {}

    video_title_el = video_block.find("h3")
    if video_title_el:
        title = video_title_el.string
        if title is None:
            # ``.string`` is None when the heading has several children;
            # use the combined text instead of the literal string "None".
            title = video_title_el.get_text()
        video_object["video_title"] = str(title)
    else:
        video_object["video_title"] = None

    # NOTE: a multi-word ``class_`` value matches the exact string value of
    # the class attribute only, i.e. the classes in this very order.
    video_link_el = video_block.find(
        class_="btn-link video-sources video-download-button")
    video_object["video_link"] = (
        video_link_el.get("href") if video_link_el else None)

    # Collect into a list guarded by a set: this removes duplicates while
    # keeping the order stable between runs (a bare set would not).
    transcript_links = []
    seen = set()
    for srt_link in video_block.select(".wrapper-download-transcripts a"):
        srt_url = srt_link.get("href")
        if srt_url is None:
            # An anchor without href carries no transcript URL.
            continue
        u = urlparse(srt_url)
        if not u.scheme:
            u = u._replace(scheme='https')
        if not u.netloc:
            u = u._replace(netloc='courses.edx.org')
        srt_url = urlunparse(u)
        if srt_url not in seen:
            seen.add(srt_url)
            transcript_links.append(srt_url)
    video_object["transcript_link"] = transcript_links

    return video_object
def get_html_table_header_and_rows(
        table: bs4.PageElement) -> Tuple[List, List]:
    """Return the header texts and the data rows of an HTML table.

    The first ``<tr>`` found in ``table`` is treated as the header row and
    every following ``<tr>`` as a data row.

    Args:
        table: Element that contains the ``<tr>`` rows.

    Returns:
        A ``(header, rows)`` tuple. ``header`` is a list with the text of
        each ``<th>``/``<td>`` cell of the first row. ``rows`` is a list
        with one list per remaining row, holding the ``<th>``/``<td>``
        elements themselves (not their text). Both lists are empty when
        ``table`` contains no ``<tr>``.
    """
    cell_tags = ['th', 'td']

    table_rows = table.find_all("tr")
    if not table_rows:
        return [], []

    # Look up the cells explicitly: iterating the row itself would also
    # yield the whitespace text nodes that sit between the cells.
    header = [cell.get_text() for cell in table_rows[0].find_all(cell_tags)]
    rows = [list(table_row.find_all(cell_tags))
            for table_row in table_rows[1:]]
    return header, rows
def get_element_with_comment(container: PageElement, comment: str) -> PageElement:
    """Return the parent of the first text node accepted by ``_find_comment``.

    Args:
        container: Element whose text nodes are searched.
        comment: Value forwarded to ``_find_comment`` together with each
            candidate text node.

    Returns:
        The result of ``find_parent()`` on the first text node for which
        ``_find_comment(node, comment)`` is truthy.

    Raises:
        AttributeError: If no text node is accepted, because the search
            then yields ``None``.
    """
    def _accepts(text_node):
        # Bind ``comment`` so the search callback takes the node only.
        return _find_comment(text_node, comment)

    matching_node = container.find(text=_accepts)
    return matching_node.find_parent()