Code example #1
File: parser.py Project: sdobz/avalarky.report
def linkify_soup(soup, new_tag):
    assert hasattr(soup, 'contents')
    tags = set()
    old_elements = [e for e in soup.contents]
    for element in old_elements:
        if not isinstance(element, NavigableString):
            tags = tags.union(linkify_soup(element, new_tag))
            continue

        segments = tag_re.split(element)

        if len(segments) <= 1:
            continue

        insertion_target = element

        for segment in segments:
            if len(segment) > 0:
                if tag_re.match(segment) is None:
                    new_e = NavigableString(segment)
                else:
                    new_e = new_tag("a", href='tag/{}.html'.format(segment))
                    new_e.string = segment
                    tags.add(segment[1:])
                insertion_target.insert_after(new_e)
                insertion_target = new_e

        element.extract()

    return tags
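
A minimal usage sketch for linkify_soup, assuming a hashtag-style tag_re (the real pattern is not part of the excerpt above):

import re
from bs4 import BeautifulSoup, NavigableString

tag_re = re.compile(r'(#\w+)')  # assumption: the capturing group keeps matches in re.split results

soup = BeautifulSoup('<p>hello #news world</p>', 'html.parser')
print(linkify_soup(soup, soup.new_tag))  # {'news'}
print(soup)  # <p>hello <a href="tag/#news.html">#news</a> world</p>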
Code example #2
def scrapeError(url: str,
                elem: str,
                attr: Tuple[str, str],
                field_err: List[str],
                auth: bool = False) -> str:
    """
    Scrape the input error from the page at the given url.
    """
    if auth:
        token = b64decode(session["token"])
        token = str(token, encoding="utf-8")
        web_page = requests.request(
            'GET',
            url,
            headers={'User-Agent': f"{request.user_agent}"},
            params={"token": token},
            allow_redirects=False)
    else:
        web_page = requests.request(
            'GET',
            url,
            headers={'User-Agent': f"{request.user_agent}"},
            allow_redirects=False)
    soup = BeautifulSoup(web_page.content, 'html.parser')  # explicit parser avoids bs4's guessed-parser warning
    elem_tag = soup.find_all(elem, {attr[0]: attr[1]})
    for i in elem_tag:
        for err in field_err:
            i.insert(0, NavigableString(f"- {err}\n"))
            error = i
            return error
Code example #3
    def test_ins5(self):
        self.soup.div.insert(0, NavigableString("Not a BeautifulSoup object"))

        self.assertEqual(
            str(self.soup.div),
            '<div class="second">Not a BeautifulSoup object'
            'Second element</div>')
Code example #4
File: __init__.py Project: m3brown/cfgov-refresh
def add_link_markup(tags):

    for tag in tags:
        added_icon = False
        if not tag.attrs.get('class', None):
            tag.attrs.update({'class': []})
        if tag['href'].startswith('/external-site/?'):
            components = urlparse(tag['href'])
            arguments = parse_qs(components.query)
            if 'ext_url' in arguments:
                external_url = arguments['ext_url'][0]
                tag['href'] = signed_redirect(external_url)

        elif NONCFPB_LINK_PATTERN.match(tag['href']):
            # Sets the icon to indicate you're leaving consumerfinance.gov
            tag.attrs['class'].append(EXTERNAL_A_CSS)
            if EXTERNAL_LINK_PATTERN.match(tag['href']):

                tag['href'] = signed_redirect(tag['href'])

            added_icon = True
        elif DOWNLOAD_LINK_PATTERN.search(tag['href']):
            # Sets the icon to indicate you're downloading a file
            tag.attrs['class'].append(DOWNLOAD_A_CSS)
            added_icon = True
        if added_icon:
            # Wraps the link text in a span that provides the underline
            contents = tag.contents
            span = BeautifulSoup('', 'html.parser').new_tag('span')
            span['class'] = EXTERNAL_SPAN_CSS
            span.contents = contents
            tag.contents = [span, NavigableString(' ')]
        elif not FILES_LINK_PATTERN.match(tag['href']):
            fix_link(tag)
Code example #5
def generate_stats_with_values(stats_with_values: List[cards.StatWithValue]) -> Tag:
    for stat_with_value in stats_with_values:
        swv_section = Tag(name="section")
        swv_section['class'] = "stat-with-value"
        swv_stat = Tag(name="label")
        swv_section.append(swv_stat)
        swv_stat.append(NavigableString(stat_with_value.stat))
        swv_value = Tag(name="data")
        swv_section.append(swv_value)
        swv_value['value'] = stat_with_value.value
        swv_value.append(NavigableString(stat_with_value.value))
        if stat_with_value.unit is not None:
            small = Tag(name="small")
            swv_value.append(small)
            small.append(NavigableString(stat_with_value.unit))
        yield swv_section
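
A hedged usage sketch; StatWithValue below is a stand-in for cards.StatWithValue, which is not shown in the excerpt:

from dataclasses import dataclass
from typing import Optional

@dataclass
class StatWithValue:  # assumption: mirrors the fields the generator reads
    stat: str
    value: str
    unit: Optional[str] = None

for section in generate_stats_with_values([StatWithValue('Range', '30', 'ft')]):
    print(section)
# <section class="stat-with-value"><label>Range</label><data value="30">30<small>ft</small></data></section>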
Code example #6
    def _recursive_replace(self, tag):
        if hasattr(tag, "contents"):  # noqa: WPS421 (builtin function call, special cases only)
            for index, child in enumerate(tag.contents):
                if child.name == "code":
                    tag.contents[index] = NavigableString(self.store(str(child)))
                else:
                    self._recursive_replace(child)
Code example #7
def insert_paragraph_breaks(soup):
    """Identify <br> and <hr> and split their parent element into multiple elements where appropriate."""
    # Indicator which is used as a placeholder to mark paragraph breaks
    BREAK_INDICATOR = "|BREAK_HERE|"

    # Find consecutive <br> elements and replace with a break marker
    for element in soup.find_all('br'):
        # When the next element is not another <br>, count how long the chain is
        if (element.next_sibling is None) or (element.next_sibling.name != 'br'):
            br_element_chain = [element]
            while (br_element_chain[-1].previous_sibling is not None) and \
                  (br_element_chain[-1].previous_sibling.name == 'br'):
                br_element_chain.append(br_element_chain[-1].previous_sibling)

            # If there's only one <br> then we replace it with a space
            if len(br_element_chain) == 1:
                br_element_chain[0].replace_with(' ')
            # If there are multiple <br>s then replace them with BREAK_INDICATOR
            else:
                br_element_chain[0].replace_with(BREAK_INDICATOR)
                for inner_element in br_element_chain[1:]:
                    inner_element.decompose()

    # Find consecutive <hr> elements and replace with a break marker
    # Use a list rather than the generator, since we are altering the tree as we traverse it
    for element in list(soup.find_all('hr')):
        element.replace_with(BREAK_INDICATOR)

    # Consolidate the text again now that we have added strings to the tree
    consolidate_text(soup)

    # Iterate through the tree, splitting string elements which contain BREAK_INDICATOR
    # Use a list rather than the generator, since we are altering the tree as we traverse it
    for element in list(soup.find_all(string=True)):
        if BREAK_INDICATOR in element:
            # Split the text into two or more fragments (there may be multiple BREAK_INDICATORs in the string)
            text_fragments = [
                s.strip() for s in str(element).split(BREAK_INDICATOR)
            ]

            # Get the parent element
            parent_element = element.parent

            # If the parent is a paragraph then we want to close and reopen by creating a new tag
            if parent_element.name == "p":
                # Iterate in reverse order as we are repeatedly adding new elements directly after the original one
                for text_fragment in text_fragments[:0:-1]:
                    new_p_element = soup.new_tag("p")
                    new_p_element.string = text_fragment
                    parent_element.insert_after(new_p_element)
                # Replace this element by a navigable string containing the first text fragment
                element.replace_with(NavigableString(text_fragments[0]))
            # Otherwise we want to simply include all the text fragments as independent NavigableStrings (that will be wrapped later)
            else:
                # Iterate in reverse order as we are repeatedly adding new elements directly after the original one
                for text_fragment in text_fragments[:0:-1]:
                    element.insert_after(soup.new_string(text_fragment))
                element.string.replace_with(text_fragments[0])
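
A stand-alone sketch of the splitting step above, reduced to a single text node containing the marker:

from bs4 import BeautifulSoup, NavigableString

BREAK_INDICATOR = "|BREAK_HERE|"
soup = BeautifulSoup('<p>first|BREAK_HERE|second</p>', 'html.parser')
element = soup.p.string
text_fragments = [s.strip() for s in str(element).split(BREAK_INDICATOR)]
parent_element = element.parent
for text_fragment in text_fragments[:0:-1]:  # reverse order: each insert lands directly after the original
    new_p_element = soup.new_tag('p')
    new_p_element.string = text_fragment
    parent_element.insert_after(new_p_element)
element.replace_with(NavigableString(text_fragments[0]))
print(soup)  # <p>first</p><p>second</p>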
Code example #8
def construct_fields_for_each_file(files, soup):
    for file in files:
        fields = etree.SubElement(file, 'fields')
        table = soup.find(attrs={'data-text': file[0].text}).findNext('table')
        tags = [
            format_tag(tag.text)
            for tag in table.find('thead').find('tr').find_all('th',
                                                               recursive=False)
        ]
        for row in table.find('tbody').find_all('tr', recursive=False):
            field = etree.SubElement(fields, 'field')
            for tag, value in zip(tags, row.find_all('td', recursive=False)):
                # rename field_name
                if tag == 'field_name':
                    attribute = etree.SubElement(field, 'identifier')
                else:
                    attribute = etree.SubElement(field, tag)
                for unwanted_table in value.find_all('table'):
                    unwanted_div = soup.new_tag('div')
                    unwanted_div.append(' (table has been removed) ')
                    unwanted_table.insert_after(unwanted_div)
                    unwanted_table.decompose()
                for li in value.find_all('li'):
                    li.insert(0, NavigableString(' • '))
                attribute.text = format(value.get_text(separator=' '))
        # add names to each field
        for field in fields:
            attribute = etree.Element('name')
            attribute.text = format_name(field[0].text)
            field.insert(1, attribute)
    return files
Code example #9
def extract_text_from_is_html(html: Union[str, TextIO]) -> str:
    """Extract article text from Ilta-Sanomat article HTML

    Args:
        html (Union[str,TextIO]): a string or a file-like object containing the article HTML

    Raises:
        ValueError: The layout of the article was not recognized, or the article parsed as empty

    Returns:
        str: article text
    """
    soup = BeautifulSoup(html, 'lxml')
    elem = soup.select_one(
        'article.single-article,article.article--m,article.article--l,article.article--xl-picture-top,article.article--xl-title-top'
    )
    if elem is None:
        raise ValueError("Article layout not recognized")
    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']:
        for block_elem in elem.find_all(tag):
            block_elem.insert_after(NavigableString('\n\n'))
    txt = elem.get_text().strip()
    if txt == "":
        raise ValueError("Parsing results in an empty article")
    return txt
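
A minimal usage sketch with a synthetic document matching one of the recognized layouts (requires the lxml parser to be installed):

html = '<article class="single-article"><h1>Title</h1><p>Body text.</p></article>'
print(extract_text_from_is_html(html))
# Title
#
# Body text.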
Code example #10
File: main.py Project: fwolf/rss-to-telegram
def clean_tags(tag, post):
    if type(tag) is NavigableString:
        tag.string.replace_with(escape(tag))
        return tag

    # Recurse into the children first; otherwise this node is no longer
    # the same element after unwrap
    for child in tag.contents:
        clean_tags(child, post)

    if tag.name == 'img':
        if 'src' in tag.attrs:
            post['images'].append(fix_url(tag.attrs['src']))
        if 'title' in tag.attrs:
            title = tag.attrs['title']
            tag.insert_before(NavigableString('[' + title + ']'))

    if tag.name not in ['b', 'i', 'a', 'code', 'pre']:
        tag.unwrap()

    if tag.name == 'a':
        allowed_attrs = {}
        for key in tag.attrs:
            if key in KEEP_ATTRIBUTES:
                allowed_attrs[key] = tag.attrs[key]
        tag.attrs = allowed_attrs

    return tag
Code example #11
def urlize(data):
    """Urlize plain text links in the HTML contents.

    Do not urlize content of A and CODE tags.

    """

    soup = BeautifulSoup(data, 'lxml')
    for found_string in soup.find_all(string=exclude_code_tag):
        new_content = []
        strings_or_tags = found_string.parent.contents
        for string_or_tag in strings_or_tags:
            try:
                for string in PLAIN_LINK_RE.split(string_or_tag):
                    if string.startswith('http'):
                        # Apply an a-Tag
                        tag = soup.new_tag('a')
                        tag['href'] = string
                        tag.string = string
                        tag['nofollow'] = 'true'
                        new_content.append(tag)
                    else:
                        # This is just a string, apply a bs4-string
                        new_content.append(NavigableString(string))
            except:
                # Regex failed, so apply what ever it is
                new_content.append(string_or_tag)

        # Apply the new content
        found_string.parent.contents = new_content

    return str(soup)
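
exclude_code_tag and PLAIN_LINK_RE are not shown above; plausible stand-ins (assumptions, not the project's actual definitions):

import re

PLAIN_LINK_RE = re.compile(r'(https?://\S+)')  # capturing group so re.split keeps the links

def exclude_code_tag(s):
    # accept only strings whose parent is neither <a> nor <code>, per the docstring
    return s.parent.name not in ('a', 'code')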
Code example #12
File: editor.py Project: freyp567/geeknote
    def checklistInSoupToENML(soup):
        '''
        Transforms github style checklists `* [ ]` in the BeautifulSoup tree to ENML.
        '''

        checktodo_re = re.compile(r'\[([ x])\]')

        # To be more github compatible, if all elements in a list begin with '[ ]',
        # convert them to en-todo evernote elements
        for ul in soup.find_all('ul'):
            tasks = []
            istodo = True

            for li in ul.find_all('li'):
                task = soup.new_tag('div')
                todo_tag = soup.new_tag('en-todo')

                reg = checktodo_re.match(li.get_text())
                istodo = istodo and reg
                character = reg.group(1) if reg else None
                if character == "x":
                    todo_tag['checked'] = "true"

                task.append(todo_tag)
                if reg:
                    task.append(NavigableString(li.get_text()[3:].strip()))
                tasks.append(task)

            if istodo:
                for task in tasks[::-1]:
                    ul.insert_after(task)
                ul.extract()
Code example #13
    def get_chapter_title(self, soup):
        heading = soup.new_tag('h2')
        heading['class'] = 'chapter-heading'
        title = ''
        chapter_number = None

        if self.chap_title_css:
            tag = soup.select_one(self.chap_title_css)
            chapter_details = re.match(
                r'(chapter\s+(\d+))[:\-\s]*([\w\s\'\-\d:.,]*)',
                tag.string,
                flags=re.IGNORECASE)
            try:
                chapter_number = chapter_details.group(2)
                title = chapter_details.group(3)
            except AttributeError:
                title = ''

        self.current_chapter = chapter_number if chapter_number else int(
            self.current_chapter) + 1
        chap_title = 'Chapter ' + str(self.current_chapter)
        chap_title += (' - ' + title) if title else ''
        heading.string = NavigableString(chap_title)

        if self.debug:
            print(chap_title)

        return heading
Code example #14
File: results.py Project: benbusby/whoogle-search
    def replace_any_case(element: NavigableString, target_word: str) -> None:
        # Replace all instances of the word, but maintaining the same case in
        # the replacement
        if len(element) == len(target_word):
            return

        if not re.match('.*[a-zA-Z0-9].*', target_word) or (
                element.parent and element.parent.name == 'style'):
            return

        element.replace_with(BeautifulSoup(
            re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b',
                   r'<b>\1</b>',
                   html.escape(element),
                   flags=re.I), 'html.parser')
        )
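
A hedged usage sketch, assuming replace_any_case and its imports (re, html, BeautifulSoup) are in scope at module level:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p>Python and python</p>', 'html.parser')
replace_any_case(soup.p.string, 'python')
print(soup)  # <p><b>Python</b> and <b>python</b></p>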
Code example #15
    def link(self):
        soup = BeautifulSoup("", 'html5lib')
        link = soup.new_tag('a', href="#bibliography" + str(self.ordering))
        link['class'] = "bibliography-reference"
        link['data-ordering'] = str(self.ordering)
        link.insert(0, NavigableString(self.content_title))
        return link
Code example #16
def add_replacement_links(p, keys, soup, bib):
    """
    Given a paragraph object and possible bibtex keys, add a replacement link to the paragraph object

    :param Tag p: BS Paragraph object
    :param list(str) keys: List of citation keys
    :param BeautifulSoup soup: Beautiful Soup object
    :param dict(str, str) bib: Dictionary created from BibTeX
    """
    p.append(NavigableString('['))
    for i, key in enumerate(keys):
        p.append(create_link_from_entry(soup, bib, key))
        if i + 1 == len(keys):
            p.append(NavigableString('] '))
        else:
            p.append(NavigableString(', '))
Code example #17
def extract_text_from_svyle_html(html: Union[str, TextIO]) -> str:
    """Extract article text from Svenska YLE article HTML

    Args:
        html (Union[str,TextIO]): a string or a file-like object containing the article HTML

    Raises:
        ValueError: The layout of the article was not recognized, or the article parsed as empty

    Returns:
        str: article text
    """
    soup = BeautifulSoup(html, 'lxml')
    elem = soup.select_one('article#main-content')
    if elem is None:
        raise ValueError("Article layout not recognized")
    for elem_to_remove in soup.select('aside#id-article__tags'):
        elem_to_remove.extract()
    for elem_to_remove in soup.select('#comments'):
        elem_to_remove.extract()
    for elem_to_remove in soup.select('.ydd-share-buttons'):
        elem_to_remove.extract()

    for tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'div']:
        for block_elem in elem.find_all(tag):
            block_elem.insert_after(NavigableString('\n\n'))
    txt = elem.get_text().strip()
    if txt == "":
        raise ValueError("Parsing results in an empty article")
    return txt
Code example #18
    def format(self, article, subscriber, codes=None):
        formatted_article = deepcopy(article)
        pub_seq_num = superdesk.get_resource_service(
            'subscribers').generate_sequence_number(subscriber)
        doc = {}
        try:
            # If there is a dateline inject it into the body
            if formatted_article.get(
                    FORMAT) == FORMATS.HTML and formatted_article.get(
                        'dateline', {}).get('text'):
                soup = BeautifulSoup(formatted_article.get('body_html'),
                                     "html.parser")
                ptag = soup.find('p')
                if ptag is not None:
                    ptag.insert(
                        0,
                        NavigableString('{} '.format(
                            formatted_article.get('dateline').get('text'))))
                    formatted_article['body_html'] = str(soup)
                doc['message_html'] = render_template(
                    'email_article_body.html', article=formatted_article)
            else:
                doc['message_html'] = None
            doc['message_text'] = render_template('email_article_body.txt',
                                                  article=formatted_article)
            doc['message_subject'] = render_template(
                'email_article_subject.txt', article=formatted_article)
        except Exception as ex:
            raise FormatterError.EmailFormatterError(ex, FormatterError)
        return [(pub_seq_num, json.dumps(doc))]
Code example #19
    def _parse(self, html):
#        print(html)
        soup = BeautifulSoup(html.decode('utf-8'), "html5lib")
        
        for comment in soup.find_all(text=lambda text: isinstance(text, Comment)):
            comment.extract()

        self.meta = soup.find_all('meta')
        try:
            soup.find('meta', attrs={'name':'hdl'}).get('content')
            soup.find('meta', attrs={'name':'dat'}).get('content')
            soup.find('meta', attrs={'name':'byl'}).get('content')
        except AttributeError:
            self.real_article = False
            # return

        try:
            p_tags = list(soup.find("article", {"id":"story"}).find_all('p'))
        except:
            print(html)
            return 
        div = soup.find('div', attrs={'class': 'story-addendum story-content theme-correction'})
        if div:
            p_tags += [div]
        footer = soup.find('footer', attrs={'class':'story-footer story-content'})
        if footer:
            p_tags += list(footer.find_all(lambda x: x.get('class') != 'story-print-citation' and x.name == 'p'))

        p_contents = reduce(operator.concat, [p.contents + [NavigableString('\n')] for p in p_tags], [])

        body_strings = []
        for node in p_contents:
            if type(node) is NavigableString:
                body_strings.append(node)
            else:
                if node.name == 'br':
                    body_strings.append(' \n ')
                else:
                    try:
                        body_strings.append(node.get_text())
                    except:
                        body_strings.append(node)

        main_body = ''.join(body_strings)

#        authorids = soup.find('div', attrs={'class':'authorIdentification'})
#        authorid = authorids.getText() if authorids else ''

        top_correction = ' '.join(x.getText() for x in
                                   soup.find_all('nyt_correction_top')) or ' '
        bottom_correction = ' '.join(x.getText() for x in
                                   soup.find_all('nyt_correction_bottom')) or ' '

        self.body = '\n'.join([top_correction,
                               main_body,
#                               authorid,
                               bottom_correction,])
#        print(self.body)
Code example #20
def add_link_markup(tag):
    """Add necessary markup to the given link and return if modified.

    Add an external link icon if the input is not a CFPB (internal) link.
    Add an external link redirect if the input is not a gov link.
    If it contains a descendent that should not get an icon, return the link.
    If not, add a download icon if the input is a file.
    Otherwise (internal link that is not a file), return None.
    """
    icon = False

    tag = BeautifulSoup(tag, 'html.parser').find('a', href=True)

    if tag is None:
        return None

    if not tag.attrs.get('class', None):
        tag.attrs.update({'class': []})

    if tag['href'].startswith('/external-site/?'):
        # Sets the icon to indicate you're leaving consumerfinance.gov
        icon = 'external-link'
        components = urlparse(tag['href'])
        arguments = parse_qs(components.query)
        if 'ext_url' in arguments:
            external_url = arguments['ext_url'][0]
            # Add the redirect notice as well
            tag['href'] = signed_redirect(external_url)

    elif NON_CFPB_LINKS.match(tag['href']):
        # Sets the icon to indicate you're leaving consumerfinance.gov
        icon = 'external-link'
        if NON_GOV_LINKS.match(tag['href']):
            # Add the redirect notice as well
            tag['href'] = signed_redirect(tag['href'])

    elif DOWNLOAD_LINKS.search(tag['href']):
        # Sets the icon to indicate you're downloading a file
        icon = 'download'

    if tag.select(', '.join(ICONLESS_LINK_CHILD_ELEMENTS)):
        # If this tag has any children that are in our list of child elements
        # that should not get an icon, it doesn't get the icon. It might still
        # be an external link and modified accordingly above.
        return str(tag)

    if icon:
        tag.attrs['class'].append(LINK_ICON_CLASSES)
        # Wraps the link text in a span that provides the underline
        contents = tag.contents
        span = BeautifulSoup('', 'html.parser').new_tag('span')
        span['class'] = LINK_ICON_TEXT_CLASSES
        span.contents = contents
        tag.contents = [span, NavigableString(' ')]
        # Appends the SVG icon
        tag.contents.append(BeautifulSoup(svg_icon(icon), 'html.parser'))
        return str(tag)

    return None
Code example #21
def generate_sub_figure(typed_thing: Union[cards.Action, cards.Card]) -> Tag:
    figure = Tag(name="figure")
    figcaption = Tag(name="figcaption")
    h2 = Tag(name="h2", attrs={'class': 'type'})
    h2.append(NavigableString(typed_thing.sub_type.title()))
    figcaption.append(h2)
    figure.append(figcaption)
    return figure
Code example #22
    def replace_a(cls, element):

        if isinstance(element, NavigableString):
            return

        if element.name == "a":
            label = "{removed href}"
            element.replaceWith(NavigableString(f"{label} {element.text}"))
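
The same replacement written inline, for illustration outside the class:

from bs4 import BeautifulSoup, NavigableString

soup = BeautifulSoup('<p><a href="http://x.example">link</a></p>', 'html.parser')
a = soup.p.a
a.replace_with(NavigableString(f"{{removed href}} {a.text}"))
print(soup)  # <p>{removed href} link</p>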
Code example #23
File: generate.py Project: b-01/startpage
def _inline_image(image_tag: PageElement, image_file: Path) -> bool:
    """ replacement callable to replace img tags for inline_data """

    image_content = "data:image/png;base64," + base64.b64encode(
        image_file.read_bytes()).decode("utf-8")
    image_content = NavigableString(image_content)

    image_tag["src"] = image_content
Code example #24
def create_link_from_entry(soup, bib, key):
    """
    Creates link/replacement text for bibtex entry

    :param BeautifulSoup soup: Beautiful Soup object
    :param dict bib: Dictionary created from BibTeX
    :param str key: Single citation key
    :return: Link or text to replace citation
    :rtype: Tag|NavigableString
    """
    # Define possible tags to use
    b_tag = soup.new_tag('b')

    try:
        entry = bib[key]
    except KeyError:
        click.echo('Entry {} not found in bibtex file!'.format(key))
        b_tag.append('{}'.format(key))
        return b_tag

    # Info for entry: <Author> et al. <Year>.
    try:
        author = NavigableString(entry['author'].split(',')[0] + ' et al. ' +
                                 entry['year'] + '. ')
    except KeyError:
        click.echo(
            'Author not found for bibtex key: "{}", using key instead'.format(
                key))
        author = NavigableString(key + ' ')

    # Use DOI if available
    if 'doi' in entry.keys():
        link = soup.new_tag('a', href='https://doi.org/' + entry['doi'])
        link.append(author)
        return link

    # Otherwise use URL
    elif 'url' in entry.keys():
        link = soup.new_tag('a', href=entry['url'])
        link.append(author)
        return link

    # Else return author
    else:
        b_tag.append(author)
        return b_tag
Code example #25
File: linker.py Project: mverleg/notex_pkgs
	def __call__(self, soup, style='tmp-json-style'):
		order_map = {identifier: index + 1 for index, identifier in enumerate(self.config.reference_counts.keys())}
		for bio_tag in soup.find_all('notex-bibliography'):
			if len(self.config.citations):
				ol_tag = BeautifulSoup.new_tag(bio_tag, 'ol', **{'class': 'bibliography-list'})
				for identifier, count in self.config.reference_counts.items():
					if identifier in self.config.citations:
						li_tag = BeautifulSoup.new_tag(soup, 'li', **{
							'id': 'cite-{0:d}'.format(order_map[identifier]),
							'class': 'citation-details',
							'ref-count': count,
						})
						li_tag.append(NavigableString(self.config.citations[identifier].render(style)))
						ol_tag.append(li_tag)
					else:
						#todo: logging
						print('reference to citation "{0:s}" which is not defined'.format(identifier))
						li_tag = BeautifulSoup.new_tag(soup, 'li')
						li_tag.append(NavigableString('unknown citation "{0:s}"'.format(identifier)))
						ol_tag.append(li_tag)
				bio_tag.append(ol_tag)
		tag_names = set()
		if self.config.has_ci_tag:
			tag_names.add('reference-ci')
		if self.config.has_cite_tag:
			tag_names.add('reference-cite')
		if tag_names:
			for ref_tag in soup.find_all(tag_names):
				identifier = ref_tag.attrs['data-identifier']
				if identifier in self.config.citations:
					citation = self.config.citations[identifier]
					ref_tag.attrs['class'].append('citstyle-{0:s}'.format(style))
					index = order_map[identifier]
					ref_tag.parent.attrs['href'] = '#cite-{0:d}'.format(index)
					if ref_tag.name == 'reference-cite':
						marker = BeautifulSoup.new_tag(ref_tag, 'cite')
						marker.append(NavigableString(self.config.citations[identifier].ref_title()))
					else:
						marker = NavigableString(citation.ref_name(index, style))
					ref_tag.append(marker)
				else:
					#todo: logging
					print('reference "{0:s}" not found'.format(identifier))
					ref_tag.attrs['class'].extend(['citstyle-{0:s}'.format(style),
						'citation-not-found', 'not-found'])
					ref_tag.append(NavigableString('[reference "{0:s}" ??]'.format(identifier)))
Code example #26
def clone_element(element: Union[Tag, NavigableString]) -> Union[Tag, NavigableString]:
    " Create a deep copy of an element from a BeautifulSoup tree. "
    if isinstance(element, Tag):
        new_element = create_empty_element_copy(element)
        for child in element.children:
            new_element.append(clone_element(child))
        return new_element

    return NavigableString(str(element))
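
create_empty_element_copy is not shown in the excerpt; a plausible stand-in (assumption: an empty Tag with the same name and attributes), plus a quick independence check:

from bs4 import BeautifulSoup, Tag

def create_empty_element_copy(element: Tag) -> Tag:
    return Tag(name=element.name, attrs=dict(element.attrs))

soup = BeautifulSoup('<div class="c"><b>x</b></div>', 'html.parser')
copy = clone_element(soup.div)
copy.b.string = 'y'
print(soup.div)  # <div class="c"><b>x</b></div> (original unchanged)
print(copy)      # <div class="c"><b>y</b></div>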
Code example #27
    def recursive_replace(tag):
        if hasattr(tag, "contents"):
            for i in range(len(tag.contents)):
                child = tag.contents[i]
                if child.name == "code":
                    tag.contents[i] = NavigableString(
                        self.store(str(child)))
                else:
                    recursive_replace(child)
Code example #28
def format_indents(soup):
    """ Needs clean up """
    for indent in soup.find_all('indent'):
        if not len(indent.contents):
            indent.decompose()

    prev_left = None
    for indent in soup.find_all('indent'):
        match = None
        if indent.find('sml-image'):
            indent.find('sml-image').extract()
            indent.insert(0, NavigableString(u'• '))

        if isinstance(indent.contents[0], NavigableString):
            match = listlike_reg.match(indent.contents[0])

        left = get_left(indent)
        if not match or (prev_left and left > prev_left):
            if indent.previous_sibling and indent.previous_sibling.name == 'list':
                text = indent.previous_sibling.find_all('text')[-1]
                text.append(' ')
                for c in indent.contents[:]:
                    text.append(c)
                indent.decompose()
                continue

        indent.name = 'entry'
        if match:
            text = soup.new_tag('text')
            text.string = match.group(2)

            indent.contents[0].replace_with('')

            for c in indent.contents[:]:
                text.append(c)
            insert = 0
            if match.group(1) != u'•':
                label = soup.new_tag('label')
                label.string = match.group(1)
                indent.insert(insert, label)
                insert += 1
            indent.insert(insert, text)
        else:
            text = soup.new_tag('text')
            for c in indent.contents[:]:
                text.append(c)
            indent.insert(0, text)

        if not (indent.previous_sibling
                and indent.previous_sibling.name == 'list'):
            new_list = soup.new_tag('list')
            entry = indent.replace_with(new_list)
            new_list.append(entry)
        else:
            indent.previous_sibling.append(indent)
        indent.attrs = {}
        prev_left = left
Code example #29
    def set_html(self):
        if self.mime_data.hasHtml():
            markup = self.mime_data.html()
            soup = BeautifulSoup(markup, "html.parser")
            for inner_text in list(soup.strings):
                inner_text.replace_with(
                    NavigableString(self.modify_text(inner_text)))
            self.fin_mime_data.setHtml(str(soup))
            self.format_list.remove("text/html")
Code example #30
def generate_latex_from_element(element: NavigableString, payload: dict):
    # if this is no tag, this is pure text
    if isinstance(element, str):
        if element.strip() == "":
            return ""
        return sanitize_string(element)
    data = process_symbols(element, payload,
                           get_latex_for_element(element.name))
    return data
Code example #31
def process_bibtex2html_output(bibtex2html_output, d):
    """ 
        From the bibtex2html output, get clean version. 
    """
    #    frag = bs(bibtex2html_output)
    frag = BeautifulSoup(bibtex2html_output, 'html.parser')

    with open(os.path.join(d, 'fixed_interpreted.html'), 'w') as f:
        f.write(str(frag))

    res = Tag(name='div')

    ids = []
    for dt in list(frag.select('dt')):
        assert dt.name == 'dt'
        name = dt.a.attrs['name']
        name = 'bib:' + name
        ids.append(name)
        dd = dt.findNext('dd')
        assert dd.name == 'dd'
        entry = dd.__copy__()
        entry.name = 'cite'
        entry.attrs['id'] = name

        try_to_replace_stuff = True
        if try_to_replace_stuff:
            for x in list(entry.descendants):
                if isinstance(x, NavigableString):
                    # Python 3: work on the text directly instead of the old
                    # encode/unicode round-trip
                    s = str(x)
                    s = s.replace('\n', ' ')
                    s = s.replace('[', '')
                    s = s.replace('|', '')
                    s = s.replace(']', '')
                    y = NavigableString(s)
                    x.replace_with(y)
                    #print('string %r' % x.string)
                if isinstance(x, Tag) and x.name == 'a' and x.string == 'bib':
                    x.extract()
        res.append(NavigableString('\n'))
        res.append(entry)
        res.append(NavigableString('\n'))
    res.attrs['id'] = 'bibliography_entries'
    logger.info('Found %d bib entries.' % len(ids))
    return str(res)