Example #1
def post2list(ele: PageElement):
    post_list = []

    headers = ele.find('div', class_='hd')('li')
    for header in headers:
        post_list.append({
            'name': header.a.text,
            'link': header.a['href'],
            'children': []
        })

    uls = ele.find('div', class_='bd')('ul')
    for i in range(len(post_list)):
        for li in uls[i]('li'):
            post_list[i]['children'].append({
                'name':
                ''.join(li('a')[-1].text.split()),
                'link':
                li('a')[-1]['href'],
                'new':
                True if li.img else False,
                'date':
                li.span.text if li.span else ''
            })

    return post_list
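A minimal usage sketch for post2list (the HTML below is fabricated to match the selectors the function relies on: a div.hd holding the header links and a div.bd holding one ul of child posts per header). Note how ''.join(...split()) strips all whitespace inside each child post's name.

from bs4 import BeautifulSoup

html = """
<div class="board">
  <div class="hd"><ul><li><a href="/notice">Notice</a></li></ul></div>
  <div class="bd">
    <ul>
      <li><a href="/notice/1">First post</a> <span>2021-01-01</span></li>
    </ul>
  </div>
</div>
"""

soup = BeautifulSoup(html, 'html.parser')
print(post2list(soup.find('div', class_='board')))
# Roughly: [{'name': 'Notice', 'link': '/notice', 'children': [
#     {'name': 'Firstpost', 'link': '/notice/1', 'new': False, 'date': '2021-01-01'}]}]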
Example #2
def _parseTable(table: PageElement):

    print('table =======>', table)
    tb = dict({
        "table": table.get('name', ''),
        "numRows": table.get('numrows', ''),
        #"remarks": table.get('remarks', '')
    })

    cols = table.findAllNext('column')

    def _parseCol(col: PageElement):
        col_dict = {
            "col_name": col.get('name', 'unknown name'),
            "col_remarks": col.get('remarks', ''),
            "autoUpdated": col.get('autoupdated', ''),
            "nullable": col.get('nullable', ''),
        }
        return col_dict

    columns = list(map(_parseCol, cols))
    cols = {"columns": list(columns)}
    tab = {"tb": tb}

    return {**tab, **cols}
Example #3
def make_cover(soup: PageElement, options: Options):
    """ Generate a cover page.

    Arguments:
        soup {BeautifulSoup} -- target element.
        options {Options} -- the project options.
    """

    if not options.cover:
        return

    options.logger.info('Generate a cover page.')

    article = soup.new_tag('article', id='doc-cover')

    d = soup.new_tag('div', **{'class': 'wrapper'})
    article.append(d)

    box = soup.new_tag('div', **{'class': 'wrapper'})
    article.append(box)

    title = options.cover_title
    h1 = soup.new_tag('h1')
    h1.append(title)
    box.append(h1)

    sub_title = options.cover_subtitle
    if sub_title:
        h2 = soup.new_tag('h2')
        h2.append(sub_title)
        box.append(h2)

    article.append(_gen_address(soup, options))

    soup.body.insert(0, article)
Example #4
def convert_for_two_columns(soup: PageElement,
                            level: int,
                            logger: Logger = None):
    if level == 0:
        return
    elif level != 3:
        if logger:
            logger.warning('`two_columns_level` only supports `3` for now.')
        return

    if logger:
        logger.info('Converting to a two-column layout (heading level 3).')

    ignored = []
    for el in soup.find_all('h3'):
        if el in ignored:
            continue
        els = [
            i for i in itertools.takewhile(
                lambda x: x.name not in ['h1', 'h2'], el.next_siblings)
        ]
        section = soup.new_tag('section',
                               **{'class': 'md-typeset two-columns'})
        el.wrap(section)
        for tag in els:
            section.append(tag)
            if tag.name == 'h3':
                ignored.append(tag)
        images_size_to_half_in(section)
Example #5
def _parseCol(col: PageElement):
    col_dict = {
        "col_name": col.get('name', 'unknown name'),
        "col_remarks": col.get('remarks', ''),
        "autoUpdated": col.get('autoupdated', ''),
        "nullable": col.get('nullable', ''),
    }
    return col_dict
Example #6
def replace_asset_hrefs(soup: PageElement, base_url: str) -> PageElement:
    """makes all relative asset links absolute"""

    for link in soup.find_all('link', href=True):
        link['href'] = abs_asset_href(link['href'], base_url)

    for asset in soup.find_all(src=True):
        asset['src'] = abs_asset_href(asset['src'], base_url)

    return soup
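replace_asset_hrefs depends on an abs_asset_href helper defined elsewhere in the project; the stand-in below (a plain urljoin, not the project's actual implementation) is only meant to make the effect visible.

from urllib.parse import urljoin
from bs4 import BeautifulSoup

def abs_asset_href(href: str, base_url: str) -> str:
    # hypothetical stand-in for the project's helper
    return urljoin(base_url, href)

soup = BeautifulSoup(
    '<link href="css/site.css" rel="stylesheet"><img src="img/logo.png">',
    'html.parser')
replace_asset_hrefs(soup, 'https://example.com/docs/')
print(soup)
# roughly: <link href="https://example.com/docs/css/site.css" rel="stylesheet"/>
#          <img src="https://example.com/docs/img/logo.png"/>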
Example #7
File: generate.py  Project: b-01/startpage
def _inline_script(script_tag: PageElement, script_file: Path) -> bool:
    """ replacement callable to replace scripts for inline_data """

    script_content = NavigableString(script_file.read_text())

    new_script_tag = BeautifulSoup(features="html.parser").new_tag("script")
    new_script_tag.insert(0, script_content)
    new_script_tag["type"] = "text/javascript"

    script_tag.replaceWith(new_script_tag)
Example #8
File: generate.py  Project: b-01/startpage
def _inline_css(style_tag: PageElement, style_file: Path) -> bool:
    """ replacement callable to replace stylesheets for inline_data """

    style_content = NavigableString(style_file.read_text())

    new_style_tag = BeautifulSoup(features="html.parser").new_tag("style")
    new_style_tag.insert(0, style_content)
    new_style_tag["type"] = "text/css"

    style_tag.replaceWith(new_style_tag)
Example #9
    def __serialize_listing(self, listing_element: PageElement):
        title_element = listing_element.find_next(class_="listing_header")
        title = title_element.text
        price_text = listing_element.find_next(class_="price").text
        price = int(re.sub("[^0-9]", "", price_text))
        body = listing_element.find_next(class_="body").text
        listing = BarnstormersClassifiedListing()
        listing.title = title
        listing.price = price
        listing.description = body
        listing.url = self.base_url + title_element["href"]
        return listing
Example #10
    def __parse_result_item(self, result_item: PageElement):
        result_title = result_item.find_next(id='title')
        price_text = re.sub("[^0-9]", "",
                            result_item.find_next(class_='txt-price').text)
        price = 0
        description = result_item.find_next(class_='description').text.strip()
        url = self.base_url + result_title['href']
        if len(price_text) != 0:
            price = int(price_text)
        return TradeAPlaneListing(title=result_title.text.strip(),
                                  price=price,
                                  description=description,
                                  url=url)
Example #11
def wrap_tabbed_set_content(soup: PageElement, logger: Logger = None):
    for ts in soup.select('div.tabbed-set'):
        for radio in ts.select('input'):
            els = [i for i in itertools.takewhile(
                lambda x: x.name not in ['input'],
                radio.next_siblings)]
            wrapper = soup.new_tag('div', **{'class': 'tabbed-content--wrap'})
            radio.wrap(wrapper)
            for tag in els:
                wrapper.append(tag)

    for d in soup.select('details'):
        d['open'] = ''
Example #12
def visit_and_hyphenate(
        node: bs4.PageElement) -> Optional[List[bs4.PageElement]]:
    """Visits HTML nodes and hyphenates text.

    Returns:
        Children of tag elements that should be further processed, e.g., <pre>
        elements are skipped.
    """
    if isinstance(node, bs4.Comment):
        return None

    # We check whether `Stylesheet` is implemented, because it's a
    # relatively recent addition to BeautifulSoup
    # (https://bazaar.launchpad.net/~leonardr/beautifulsoup/bs4/revision/564).
    # In case it is not, we don't skip <style> nodes. This will mangle
    # stylesheets if they exist, but that is a cost I'm willing to take.
    if (is_stylesheet_implemented()
            and isinstance(node, bs4.element.Stylesheet)):
        return None

    if isinstance(node, bs4.Tag):
        if node.name == 'pre':
            return None
        if node.name == 'style':
            return None
        return node.children

    if not isinstance(node, bs4.NavigableString):
        return None

    # My intention is to remove soft hyphens, so that language detection
    # works correctly.
    printable_text = only_printable(node)
    if should_ignore(printable_text):
        return None

    try:
        lang = langdetect.detect(printable_text)
        if lang == 'en':
            # Use US dictionary for English, because it seems that the US
            # dictionary is richer. For example en_GB doesn't hyphenate
            # "format," but US does ("for-mat").
            lang = 'en_US'
        dic = pyphen.Pyphen(lang=lang)
    except (langdetect.lang_detect_exception.LangDetectException, KeyError):
        return None

    new_text = hyphenate_end_node(dic, node)
    node.replaceWith(new_text)
    return None
Example #13
def convert_iframe(soup: PageElement, entries: list, logger: Logger = None):
    """Replace iFrame to a(anchor)

    e.g:
        ```html "before:"
        <iframe frameborder="0" height="100%" src="SRC"/>
        ```

        ```html "after:"
        <a class="converted-iframe" href="SRC" target="_blank">
          <img src="POSTER IMAGE"/>
        </a>
        ```
    """

    if len(entries) < 1:
        return

    if logger:
        logger.info('Converting <iframe> to poster image (if available).')

    for iframe in soup.find_all('iframe', src=True):
        for entry in entries:
            if iframe['src'] != entry.get('src'):
                continue

            a = soup.new_tag('a',
                             href=iframe['src'],
                             target='_blank',
                             **{'class': 'converted-iframe'})
            img_src = entry.get('img')
            if img_src:
                a.append(soup.new_tag('img', src=img_src))
            text = entry.get('text')
            if text:
                span = soup.new_tag('span')
                span.string = text
                a.append(span)

            # copy attributes
            for key, val in iframe.attrs.items():
                if key in ['style']:
                    a[key] = val

            iframe.replace_with(a)
Example #14
def _format_vacancy(item: PageElement):
    data = {
        "site_id":
        int(
            item.find_next(
                "a", {"class": "no-decoration"})['href'].split("/")[-2]),
        "title":
        item.find_next("a").get_text(),
        "company":
        item.find_next("div", {
            "class": "add-top-xs"
        }).find_next("b").get_text(),
        "desc":
        " ".join(
            unicodedata.normalize("NFKD",
                                  item.find_next("p").get_text()).split()),
        "salary":
        unicodedata.normalize("NFKD", a)
        if "грн" in (a := item.find_next("b").get_text()) else None,
        "city":
        "|".join([
            i.replace('\xa0', ' ')
            for i in item.find_next("div", {
                "class": "add-top-xs"
            }).get_text().split("·")[1:]
        ]),
        "link":
        "https://work.ua" +
        item.find_next("a", {"class": "no-decoration"})['href']
    }
    return data
Example #15
def get_html_table_header_and_rows(
        table: bs4.PageElement) -> Tuple[List, List]:
    """
    return the header and rows of an HTML table as lists
    """
    header = []
    rows = []
    table_header = table.find("tr")
    table_rows = table.find_all("tr")[1:]
    for items in table_header:
        header.append(items.get_text())

    for table_row in table_rows:
        row = []
        for cell in table_row.findAll(['th', 'td']):
            row.append(cell)
        rows.append(row)

    return header, rows
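A self-contained check of the helper above; the sample table is made up for illustration.

from bs4 import BeautifulSoup

html = """
<table>
  <tr><th>Name</th><th>Age</th></tr>
  <tr><td>Alice</td><td>30</td></tr>
</table>
"""
table = BeautifulSoup(html, "html.parser").find("table")
header, rows = get_html_table_header_and_rows(table)
print(header)                                 # ['Name', 'Age']
print([cell.get_text() for cell in rows[0]])  # ['Alice', '30']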
Example #16
def fix_twemoji(soup: PageElement, logger: Logger = None):
    """ (workaraound) replace <svg> to <img + b64encoded data/>

    cause, don't shown WeasyPrint 51
    for after material v4.5.0

    @see https://github.com/squidfunk/mkdocs-material/pull/1330
    """

    def fix_size(svg):
        '''
        svg['width'] = 24
        svg['height'] = 24
        '''
        viewbox = _parse_viewbox(svg['viewbox'])
        width, height = (
            viewbox[2] - viewbox[0],
            viewbox[3] - viewbox[1]
        )
        svg['width'] = int(width)
        svg['height'] = int(height)
        svg['style'] = 'fill: currentColor;'

    if logger:
        logger.debug('Converting emoji SVG to <img> (workaround).')

    for svg in soup.select('.twemoji svg'):
        try:
            fix_size(svg)
            encoded = b64encode(str(svg).encode('utf-8')).decode('ascii')
            data = "data:image/svg+xml;charset=utf-8;base64," + encoded
            img = soup.new_tag('img', src=data,
                               **{'class': 'converted-twemoji'})
            svg.replace_with(img)

            if logger:
                logger.debug(f'> svg: {svg}')
                logger.debug(f'< img: {img}')

        except Exception as e:
            if logger:
                logger.warning(f'Failed to convert SVG: {e}')
            pass
Example #17
def _gen_address(soup: PageElement, options: Options) -> PageElement:

    box = soup.new_tag('div', **{'class': 'properties'})

    address = soup.new_tag('address')
    box.append(address)

    if options.author:
        span = soup.new_tag('p', id="author")
        span.append(options.author)
        address.append(span)

    if options.copyright:
        span = soup.new_tag('p', id="copyright")
        import html
        span.append(html.unescape(options.copyright))
        address.append(span)

    return box
Example #18
def get_combined(soup: PageElement, base_url: str,
                 rel_url: str) -> PageElement:
    """ transforms all relative hrefs pointing to other html docs
    into relative pdf hrefs
    """

    for element in soup.find_all(id=True):
        element['id'] = transform_id(element['id'], rel_url)

    for a in soup.find_all('a', href=True):
        if urls.url_is_absolute(a['href']) or os.path.isabs(a['href']):
            continue

        a['href'] = transform_href(a['href'], rel_url)

    soup.body['id'] = get_body_id(rel_url)
    soup = replace_asset_hrefs(soup, base_url)

    return soup
Example #19
def parse_video_block(video_block: PageElement) -> Dict:
    video_object = {}
    video_title_el = video_block.find("h3")
    video_object["video_title"] = str(video_title_el.string) if video_title_el else None
    video_link_el = video_block.find(class_="btn-link video-sources video-download-button")
    video_object["video_link"] = video_link_el["href"] if video_link_el else None
    transcript_link_el = video_block.select(".wrapper-download-transcripts a")
    video_object["transcript_link"] = set()
    for srt_link in transcript_link_el:
        srt_url = srt_link["href"]
        u = urlparse(srt_url)
        if not u.scheme:
            u = u._replace(scheme='https')
        if not u.netloc:
            u = u._replace(netloc='courses.edx.org')
        srt_url = urlunparse(u)
        video_object["transcript_link"].add(srt_url)
    video_object["transcript_link"] = list(video_object["transcript_link"])
    return video_object
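A quick check of parse_video_block with fabricated markup (the class attributes mirror the selectors above; find(class_="btn-link video-sources video-download-button") matches only when the class attribute is exactly that string). urlparse and urlunparse come from urllib.parse.

from urllib.parse import urlparse, urlunparse
from bs4 import BeautifulSoup

block = BeautifulSoup("""
<div>
  <h3>Intro video</h3>
  <a class="btn-link video-sources video-download-button"
     href="https://cdn.example.com/v.mp4">Download</a>
  <div class="wrapper-download-transcripts">
    <a href="//courses.edx.org/t/en.srt">English</a>
  </div>
</div>
""", "html.parser").div

print(parse_video_block(block))
# Roughly: {'video_title': 'Intro video',
#           'video_link': 'https://cdn.example.com/v.mp4',
#           'transcript_link': ['https://courses.edx.org/t/en.srt']}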
Example #20
def _label(node: PageElement):
    if isinstance(node, NavigableString):
        text = node.strip()
        parent = node.parent
        if parent.name == 'a' and 'href' in parent.attrs:
            yield '[[[{}|||{}]]]'.format(text, parent.attrs['href'])
        else:
            yield text
    elif isinstance(node, Tag):
        for child in node.children:
            yield from _label(child)
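A small illustration of the generator above (the markup is hypothetical); NavigableString and Tag come from bs4.

from bs4 import BeautifulSoup, NavigableString, Tag

soup = BeautifulSoup(
    '<p>See <a href="https://example.com">the docs</a> here</p>',
    'html.parser')
print(list(_label(soup.p)))
# ['See', '[[[the docs|||https://example.com]]]', 'here']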
Example #21
def _parseTr(tr: PageElement):
    tds = tr.findAllNext("td")
    return {
        "Table": tds[0].text,
        "Column": tds[1].text,
        "Type": tds[2].text,
        "Size": tds[3].text,
        "Nulls": tds[4].text,
        "Auto": tds[5].text,
        "Default": tds[6].text
    }
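For reference, a hypothetical row that _parseTr above would accept; it needs at least seven <td> cells in order. Because findAllNext("td") scans the rest of the document, in a full page the indices only work because the row's own cells come first.

from bs4 import BeautifulSoup

row_html = ("<tr><td>users</td><td>id</td><td>INT</td><td>10</td>"
            "<td>NO</td><td>YES</td><td>NULL</td></tr>")
tr = BeautifulSoup(row_html, "html.parser").tr
print(_parseTr(tr))
# {'Table': 'users', 'Column': 'id', 'Type': 'INT', 'Size': '10',
#  'Nulls': 'NO', 'Auto': 'YES', 'Default': 'NULL'}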
Example #22
def _format_vacancy(item: PageElement):
    data = {
        "site_id":
        int(item["_id"]),
        "title":
        item.find_next("a", {
            "class": "vt"
        }).get_text(),
        "company":
        item.find_next("a", {
            "class": "company"
        }).get_text().replace('\xa0', ''),
        "desc":
        " ".join(
            unicodedata.normalize(
                "NFKD",
                item.find_next("div", {
                    "class": "sh-info"
                }).get_text()).split()),
        "salary":
        unicodedata.normalize("NFKD", a.get_text()) if
        (a := item.find_next("span", {"class": "salary"})) else None,
        "city":
        item.find_next("span", {
            "class": "cities"
        }).get_text(),
        "link":
        item.find_next("a", {"class": "vt"})['href']
    }
    return data
Example #23
def _format_vacancy(item: PageElement):
    data = {
        "site_id":
        int(item["data-vacancy-id"]),
        "title":
        item.find_next("a", {
            "class": "ga_listing"
        }).get_text().replace("\n", ""),
        "company":
        item.find_next("a", {
            "class": "company-profile-name"
        }).get_text(),
        "desc":
        item.find_next("div", {
            "class": "card-description"
        }).get_text(),
        "salary":
        a if (a := item.find_next("span", {
            "class": "salary"
        }).get_text()) else None,
        "city":
        item.find_next("span", {
            "class": "location"
        }).get_text(),
        "link":
        "https://robota.ua" +
        item.find_next("a", {"class": "ga_listing"})['href']
    }
    return data
Example #24
    def _remove_empty_tags(self, soup: PageElement):
        def is_blank(el):
            if len(el.get_text(strip=True)) != 0:
                return False
            elif el.find(['img', 'svg']):
                return False
            else:
                return True

        includes = ['article', 'p']
        while True:
            hit = False
            for x in soup.find_all():
                if x.name in includes and is_blank(x):
                    # self.logger.debug(f'Strip: {x}')
                    x.extract()
                    hit = True
            if not hit:
                break
Example #25
def fix_image_alignment(soup: PageElement, logger: Logger = None):
    """ (workaraound) convert <img align=*> to `float` style.
    and, move <img width=*>, <image height=*> to style attributes.
    """

    if logger:
        logger.info('Converting <img> alignment (workaround).')

    for img in soup.select('img'):
        try:
            if img.has_attr('class') and 'twemoji' in img['class']:
                continue

            styles = _parse_style(getattr(img, 'style', ''))

            if logger:
                logger.debug(f'  | {img}')
            if img.has_attr('align'):
                if img['align'] == 'left':
                    styles['float'] = 'left'
                    styles['padding-right'] = '1rem'
                    styles['padding-bottom'] = '0.5rem'
                    img.attrs.pop('align')
                elif img['align'] == 'right':
                    styles['float'] = 'right'
                    styles['padding-left'] = '1rem'
                    styles['padding-bottom'] = '0.5rem'
                    img.attrs.pop('align')

            if img.has_attr('width'):
                styles['width'] = _convert_dimension(img['width'])
                img.attrs.pop('width')
            if img.has_attr('height'):
                styles['height'] = _convert_dimension(img['height'])
                img.attrs.pop('height')

            img['style'] = " ".join(f'{k}: {v};' for k, v in styles.items())
        except Exception as e:
            if logger:
                logger.warning(f'Failed to convert img align: {e}')
            pass
Example #26
def is_group_header(element: bs4.PageElement) -> bool:
    return is_tag(element) and 'groupheader' in element.get('class', ())
Example #27
    def _get_content(self, soup: PageElement, page):

        def shift_heading(elem, page):
            for i in range(7, 0, -1):
                while True:
                    h = elem.find(f'h{i}')
                    if not h:
                        break
                    h.name = f'h{i + 1}'

            page_path = self._page_path_for_id(page)
            h1 = soup.new_tag('h1', id=f'{page_path}')
            h1.append(page.title)
            elem.insert(0, h1)
            return elem

        def cleanup_class(classes: list):
            if classes and len(classes):
                excludes = ['md-content__inner']
                return [c for c in classes if not (c in excludes)]
            return classes

        article = getattr(page, 'pdf-article', None)
        if article:

            page_path = self._page_path_for_id(page)
            article['id'] = f'{page_path}:'  # anchor for each page.
            article['data-url'] = f'/{page_path}'
            return article

        elif page.children:

            new_article = soup.new_tag('article')
            found = False
            for c in page.children:
                content = self._get_content(soup, c)
                if content:
                    new_article.append(content)
                    found = True

            if not found:
                return None

            child_classes = None
            for child_article in new_article.find_all('article'):
                child_article.name = 'section'
                classes = child_article.get('class')
                if classes and not child_classes:
                    child_classes = classes
                child_article['class'] = cleanup_class(classes)

            page_path = self._page_path_for_id(page)
            new_article['id'] = f'{page_path}:'  # anchor for each page.
            new_article['data-url'] = f'/{page_path}'
            if child_classes:
                new_article['class'] = child_classes

            if self._options.heading_shift:
                return shift_heading(new_article, page)
            return new_article

        return None
Example #28
def make_indexes(soup: PageElement, options: Options) -> None:
    """ Generate ordered chapter number and TOC of document.

    Arguments:
        soup {BeautifulSoup} -- DOM object of Document.
        options {Options} -- The options of this sequence.
    """

    # Step 1: (re)order headings
    _inject_heading_order(soup, options)

    # Step 2: generate toc page
    level = options.toc_level
    if level < 1 or level > 3:
        return

    options.logger.info(
        f'Generate a table of contents up to heading level {level}.')

    h1li = None
    h2ul = h2li = h3ul = None
    exclude_lv2 = exclude_lv3 = False

    def makeLink(h: Tag) -> Tag:
        li = soup.new_tag('li')
        ref = h.get('id', '')
        a = soup.new_tag('a', href=f'#{ref}')
        for el in h.contents:
            if el.name == 'a':
                a.append(el.contents[0])
            else:
                a.append(clone_element(el))
        li.append(a)
        options.logger.debug(f"| [{h.get_text(separator=' ')}]({ref})")
        return li

    toc = soup.new_tag('article', id='doc-toc')
    title = soup.new_tag('h1')
    title.append(options.toc_title)
    toc.append(title)

    h1ul = soup.new_tag('ul')
    toc.append(h1ul)

    headings = soup.find_all(['h1', 'h2', 'h3'])
    for h in headings:

        if h.name == 'h1':

            h1li = makeLink(h)
            h1ul.append(h1li)
            h2ul = h2li = h3ul = None

            exclude_lv2 = _is_exclude(h.get('id', None), options)

        elif not exclude_lv2 and h.name == 'h2' and level >= 2:

            if not h2ul:
                h2ul = soup.new_tag('ul')
                h1li.append(h2ul)
            h2li = makeLink(h)
            h2ul.append(h2li)
            h3ul = None

            exclude_lv3 = _is_exclude(h.get('id', None), options)

        elif not exclude_lv2 and not exclude_lv3 \
                and h.name == 'h3' and level >= 3:

            if not h2li:
                continue
            if not h3ul:
                h3ul = soup.new_tag('ul')
                h2li.append(h3ul)
            h3li = makeLink(h)
            h3ul.append(h3li)

        else:
            continue
        pass

    soup.body.insert(0, toc)
Example #29
def is_declaration(element: bs4.PageElement) -> bool:
    return (is_tag(element)
        and element.name == 'div' and 'memitem' in element.get('class', ())
    )
Example #30
def get_element_with_comment(container: PageElement,
                             comment: str) -> PageElement:
    return container.find(
        text=lambda t: _find_comment(t, comment)).find_parent()
Example #31
def get_separate(soup: PageElement, base_url: str) -> PageElement:
    for a in soup.find_all('a', href=True):
        a['href'] = rel_pdf_href(a['href'])

    soup = replace_asset_hrefs(soup, base_url)
    return soup