Ejemplo n.º 1
0
def post2list(ele: PageElement):
    """Convert a tabbed post panel into a list of sections with their posts.

    The ``<li>`` items inside the ``div.hd`` element become the sections
    (name and link taken from each item's ``<a>``).  The ``<ul>`` elements
    inside the ``div.bd`` element are paired with the sections by position,
    and every ``<li>`` of a ``<ul>`` becomes one child entry of its section.

    :param ele: element containing the ``div.hd`` and ``div.bd`` blocks.
    :return: list of dicts with keys ``name``, ``link`` and ``children``;
        each child is a dict with keys ``name``, ``link``, ``new`` and
        ``date``.
    """
    tab_items = ele.find('div', class_='hd')('li')
    post_list = [
        {
            'name': tab.a.text,
            'link': tab.a['href'],
            'children': []
        }
        for tab in tab_items
    ]

    groups = ele.find('div', class_='bd')('ul')
    for index, section in enumerate(post_list):
        children = section['children']
        # Sections and <ul> groups are matched purely by position.
        for entry in groups[index]('li'):
            # The last anchor of the item carries the post title and URL.
            anchor = entry('a')[-1]
            date_tag = entry.span
            children.append({
                # Drop every whitespace character from the title.
                'name': ''.join(anchor.text.split()),
                'link': anchor['href'],
                # An <img> inside the item flags the post as new.
                'new': bool(entry.img),
                'date': date_tag.text if date_tag else ''
            })

    return post_list
Ejemplo n.º 2
0
def parse_video_block(video_block: PageElement) -> Dict:
    """Extract the title, download link and transcript links of a video block.

    :param video_block: element wrapping a single video.
    :return: dict with the keys

        * ``video_title`` - text of the first ``<h3>``, or ``None`` if absent;
        * ``video_link`` - ``href`` of the download button, or ``None`` if
          absent;
        * ``transcript_link`` - list of absolute transcript URLs, without
          duplicates, in the order they appear in the block.
    """
    video_title_el = video_block.find("h3")
    video_link_el = video_block.find(
        class_="btn-link video-sources video-download-button")

    # A dict is used as an ordered set: it removes duplicates while keeping
    # document order, so the result is identical from run to run (a plain
    # set would yield the URLs in arbitrary order).
    transcript_links = {}
    for srt_link in video_block.select(".wrapper-download-transcripts a"):
        u = urlparse(srt_link["href"])
        # Complete relative / scheme-less links so they can be fetched as-is.
        if not u.scheme:
            u = u._replace(scheme='https')
        if not u.netloc:
            u = u._replace(netloc='courses.edx.org')
        transcript_links[urlunparse(u)] = None

    return {
        "video_title": str(video_title_el.string) if video_title_el else None,
        "video_link": video_link_el["href"] if video_link_el else None,
        "transcript_link": list(transcript_links),
    }
Ejemplo n.º 3
0
def get_html_table_header_and_rows(
        table: bs4.PageElement) -> Tuple[List, List]:
    """
    Return the header texts and the data rows of an HTML table.

    The first ``<tr>`` found in ``table`` is treated as the header row and
    every following ``<tr>`` as a data row.

    :param table: element whose ``<tr>`` descendants are scanned.
    :return: ``(header, rows)``. ``header`` is the list of texts of the
        ``<th>``/``<td>`` cells of the first row. ``rows`` holds one list per
        remaining row, containing that row's ``<th>``/``<td>`` cell elements
        (the elements themselves, not their text). Both lists are empty when
        the table contains no ``<tr>``.
    """
    table_rows = table.find_all("tr")
    if not table_rows:
        # No rows at all: nothing to use as a header either.
        return [], []

    cell_tags = ['th', 'td']
    # Look up the header cells explicitly instead of iterating over the
    # row's children, which would also yield the whitespace text nodes
    # sitting between the cells.
    header = [cell.get_text() for cell in table_rows[0].find_all(cell_tags)]
    rows = [list(table_row.find_all(cell_tags)) for table_row in table_rows[1:]]

    return header, rows
Ejemplo n.º 4
0
def get_element_with_comment(container: PageElement,
                             comment: str) -> PageElement:
    """Return the parent of the first text node matching ``comment``.

    Text nodes of ``container`` are tested with ``_find_comment(node,
    comment)``; ``find_parent()`` is then called on the first node found.

    :param container: element to search in.
    :param comment: value forwarded to ``_find_comment`` for every candidate
        node (presumably the comment text to look for - confirm against
        ``_find_comment``).
    :return: the result of ``find_parent()`` on the matching node.
    """
    def is_wanted(node):
        return _find_comment(node, comment)

    match = container.find(text=is_wanted)
    # No guard here: the lookup result is used unconditionally.
    return match.find_parent()