Example #1
    def run(self, cls):
        try:
            soup = cls.content
            # Normalize to HTML5: drop any existing doctype first
            # (iterate over a copy, since extract() mutates soup.contents)
            for item in list(soup.contents):
                if isinstance(item, Doctype):
                    item.extract()
            soup.insert(0, Doctype('html'))
            soup.html.attrs = {}
            soup.html['lang'] = 'pt-br'
            if not soup.head:
                soup.html.insert(0, soup.new_tag('head'))

            # Flagging epigrafe
            soup.select_one('p:nth-of-type(1)').attrs = {'class': 'epigrafe'}

            # Flagging ementa
            soup.select_one('p:nth-of-type(2)').attrs = {'class': 'ementa'}

            cls.content = soup
        except AttributeError:
            return False
        else:
            return True
Example #2
def strip_html(path, i, label_xid=True):
    """Strip the HTML: get rid of scripts and interactions"""
    print '[{}] Reading {} ...'.format(i, path)
    with codecs.open(path, 'r', 'utf8') as fin:  # built-in open() takes no encoding in Python 2; codecs.open does
        # TODO: Handle encodings
        soup = BeautifulSoup(fin.read(), 'html5lib')
    # Add doctype if missing
    if not has_doctype(soup):
        soup.insert(0, Doctype('html'))
    # Remove dangerous tags
    for x in soup('script'):
        x.extract()
    for x in soup('noscript'):
        x.extract()
    for x in soup('link'):
        if x.get('as') == 'script':
            x.extract()
    for x in soup('iframe'):
        x['src'] = ''
    # Fix styles
    for x in soup('style'):
        x.string = H.unescape(u"".join(unicode(y) for y in x.contents))
    # Label all tags
    i = 1
    for x in soup.body(True):
        for attr in list(x.attrs):
            if attr.startswith('on') or attr == 'srcset':
                del x[attr]
        if label_xid:
            x['data-xid'] = i
            i += 1
    # Return
    return soup.prettify()
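
A possible invocation of strip_html (a sketch; the file names are illustrative, and codecs is from the standard library):

import codecs

cleaned = strip_html('page.html', 0)  # returns prettified unicode HTML
with codecs.open('page.stripped.html', 'w', 'utf8') as fout:
    fout.write(cleaned)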
Example #3
    def response(self, flow: http.HTTPFlow):
        response = flow.response
        if CONTENT_TYPE in response.headers:
            if any(t in response.headers[CONTENT_TYPE]
                   for t in RELEVANT_CONTENT_TYPES):
                # Response is a web page; proceed.
                insertedScripts: List[str] = []
                soup = BeautifulSoup(response.content,
                                     HTML_PARSER,
                                     from_encoding=inferEncoding(response))
                requestURL = flow.request.pretty_url  # should work in transparent mode too, unless the Host header is spoofed
                isApplicable: Callable[[Userscript],
                                       bool] = userscript.applicableChecker(
                                           requestURL)
                for script in self.userscripts:
                    if isApplicable(script):
                        useInline = ctx.options.inline or script.downloadURL is None
                        if useInline and len(script.unsafeSequences) > 0:
                            logError(unsafeSequencesMessage(script))
                            continue
                        logInfo(
                            f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ..."""
                        )
                        result = inject(
                            script, soup,
                            Options(
                                inline=ctx.options.inline,
                                verbose=ctx.options.verbose,
                            ))
                        if type(result) is BeautifulSoup:
                            soup = result
                            insertedScripts.append(script.name + (
                                "" if script.version is None else " " +
                                stringifyVersion(script.version)))
                        else:
                            logError(
                                "Injection failed due to the following error:")
                            logError(str(result))

                index_DTD: Optional[int] = indexOfDTD(soup)
                # Insert information comment:
                if ctx.options.verbose:
                    soup.insert(
                        0 if index_DTD is None else 1 + index_DTD,
                        Comment(INFO_COMMENT_PREFIX +
                                ("No matching userscripts for this URL."
                                 if insertedScripts ==
                                 [] else "These scripts were inserted:\n" +
                                 bulletList(insertedScripts)) + "\n"))
                # Prevent BS/html.parser from emitting `<!DOCTYPE doctype html>` or similar if "DOCTYPE" is not all uppercase in source HTML:
                if index_DTD is not None and REGEX_DOCTYPE.match(
                        soup.contents[index_DTD]):
                    # There is a DTD and it is invalid, so replace it.
                    soup.contents[index_DTD] = Doctype(
                        re.sub(REGEX_DOCTYPE, "", soup.contents[index_DTD]))
                # Serialize and encode:
                response.content = str(soup).encode(
                    fromOptional(soup.original_encoding, CHARSET_DEFAULT),
                    "replace")
Example #4
    def clean_text(self):
        text = self.cleaned_data["text"]

        soup = BeautifulSoup(text, "html.parser")
        if not isinstance(soup.contents[0], Doctype):
            doctype = Doctype("html")
            soup.insert(0, doctype)

        return str(soup)
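
The doctype guard in clean_text above recurs throughout these examples; a minimal self-contained sketch of just that step (note that soup.contents[0] assumes non-empty input):

from bs4 import BeautifulSoup, Doctype

soup = BeautifulSoup("<p>Hello</p>", "html.parser")
if not isinstance(soup.contents[0], Doctype):
    soup.insert(0, Doctype("html"))
print(str(soup))  # <!DOCTYPE html> followed by <p>Hello</p>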
Example #5
def bare_bones(site_name="", **kwargs):
    soup = BeautifulSoup("", "html.parser")
    soup.append(Doctype("html"))
    html = soup.new_tag("html", attrs={"lang": "en"})
    soup.append(html)
    html.append(soup.new_tag("head"))
    html.append(soup.new_tag("body", id="content"))
    apply_config(soup, site_name)
    return soup
Example #6
def get_html_listing_soup(
    in_folder: Union[Path, str],
    page_title: Optional[str] = None,
    out_file: Optional[Union[Path, str]] = None,
) -> BeautifulSoup:

    in_folder = Path(in_folder)

    soup = BeautifulSoup("", "html5lib")
    cast(Tag, soup.find("html"))["lang"] = "en"

    soup.insert(0, Doctype("html"))

    if page_title is None:
        page_title = in_folder.stem

    head = cast(Tag, soup.find("head"))
    title = soup.new_tag("title")
    title.string = page_title
    head.append(title)

    body = cast(Tag, soup.find("body"))
    ul: Tag = soup.new_tag("ul")
    body.append(ul)

    now_sec = int(time.time())
    inlined_suffix_regex = re.compile(r"_inlined$")

    li: Tag
    for demo_full_path in sorted(in_folder.glob("**/*.html")):
        if demo_full_path.is_dir() or demo_full_path.name == "index.html":
            continue

        li = soup.new_tag("li")
        ul.append(li)

        demo_relative_path = urllib.parse.quote(str(
            demo_full_path.relative_to(in_folder)),
                                                safe="/")
        a = soup.new_tag(
            "a",
            href=(f"./{demo_relative_path}?t={now_sec}"),
        )

        demo_name = inlined_suffix_regex.sub("", demo_full_path.stem)
        a.string = demo_name
        li.append(a)

    if out_file is None:
        out_file = in_folder / "index.html"

    _ = Path(out_file).write_text(str(soup))

    return soup
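
A possible call, assuming a folder of rendered demo pages (the path and title are illustrative):

listing = get_html_listing_soup("demos", page_title="Demo index")
# By default this writes demos/index.html and returns the generated soup.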
Example #7
    def __init__(self):
        super(OutputSoup, self).__init__(features='lxml')
        self.append(Doctype('html'))
        html = self.new_tag('html')
        self.append(html)
        head = self.new_tag('head')
        html.append(head)
        title = self.new_tag('title')
        title.string = 'TravellingKleinanzeigenProblem'
        head.append(title)
        self.body = self.new_tag('body')
        html.append(self.body)
Example #8
    def clean_text(self):
        text = self.cleaned_data["text"]

        soup = BeautifulSoup(text, "html.parser")
        if not isinstance(soup.contents[0], Doctype):
            doctype = Doctype("html")
            soup.insert(0, doctype)

        imgid = 0
        for img in soup.findAll("img"):
            img["id"] = "img%s" % imgid
            imgid += 1

        return str(soup)
Example #9
def generateCData(htmlIn):
    """
    This returns noteProps, and html soup to be reduced to a CData string.
    Meta-ToDo: Maybe this should be a method in a class with methods to handle different resource types?
    ToDo: This'll work for img tags but we're going to have to make it more general to handle resources other than images.
    ToDo: It'd be good to replace any <a href="evernote:///view..."s with URLs that Joplin can make something of.
    ToDo: remember to strip out disallowed tags and/or attributes.
    ToDo: return title as a string instead of an item in noteProps?
    """
    basepath, htmlfilename = os.path.split(htmlIn.rstrip(os.path.sep))
    with open(htmlIn, 'r') as f:  # close the file handle instead of leaking it
        soup = BeautifulSoup(f, 'xml')
    noteProps = {'note-title': soup.find('title').text}
    for img in soup.find_all("img"):
        # assemble image properties
        img_path = basepath + img['src'][1:]
        pic = Image.open(img_path)
        width, height = pic.size
        imghash, base64block = file_to_base64(img_path)
        mimetype, _ = guess_type(img_path)  # guess_type() returns a (type, encoding) tuple

        # make <en-media/> tag and replace img tag with it
        enmedia = Tag(name="en-media",
                      attrs={
                          'hash': imghash,
                          'type': mimetype
                      })
        img.replaceWith(enmedia)

        # generate entry in noteProps
        noteProps[imghash] = {
            'filename': img['src'][2:],
            'path': img_path,
            'type': mimetype,
            'width': str(width),
            'height': str(height),
            'data': base64block
        }

    # ToDo: more soup tinkering to create the CData string
    for t in soup:  # do I need this loop?
        if isinstance(t, Doctype):
            t.replaceWith(
                Doctype(
                    'en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"'))
    soup.html.unwrap()
    soup.head.decompose()
    soup.body.name = "en-note"

    return soup, noteProps
Example #10
def get_new_doc(args):
    doc = BeautifulSoup()
    doc.append(Doctype('html'))
    html_local = doc.new_tag('html', lang='en-US')
    doc.append(html_local)
    head = doc.new_tag('head')
    html_local.append(head)
    meta = doc.new_tag('meta', charset='utf-8')
    head.append(meta)
    title = doc.new_tag('title')
    title.string = get_title(args)
    head.append(title)
    body = doc.new_tag('body')
    html_local.append(body)
    return doc, html_local, head, body
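
get_new_doc hands back the document plus handles to its main nodes so callers can append content directly; a hypothetical use (args is whatever get_title expects):

doc, html_local, head, body = get_new_doc(args)
p = doc.new_tag('p')
p.string = 'Hello, world.'
body.append(p)
print(str(doc))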
Example #11
    def save(self, commit=True):
        m = super(MailWithAttachmentForm, self).save(commit=False)

        soup = BeautifulSoup(m.text, "html.parser")
        if not isinstance(soup.contents[0], Doctype):
            doctype = Doctype("html")
            soup.insert(0, doctype)

        m.text = str(soup)

        m.template_type = "2"

        if commit:
            m.save()

        return m
Example #12
def create_note(note_data, soup):
    """Create an ENEX note element"""

    note = soup.new_tag('note')

    title = soup.new_tag('title')
    title.string = note_data.title
    note.append(title)

    content_inside = BeautifulSoup(features="xml")
    content_inside.append(Doctype('en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"'))

    content_inside_note = soup.new_tag('en-note')
    content_inside_note.string = note_data.content
    content_inside.append(content_inside_note)

    # Holy crap this is super hacky and horrible but I don't want to fight with
    # BeautifulSoup to make it not convert all the text to HTML entities, so
    # manually convert everything to < and >
    content_inside_str = str(content_inside).replace('&lt;', '<').replace('&gt;', '>')

    content = soup.new_tag('content')
    content.string = CData(content_inside_str)
    note.append(content)

    created = soup.new_tag('created')
    created.string = str(note_data.created)
    note.append(created)

    updated = soup.new_tag('updated')
    updated.string = str(note_data.updated)
    note.append(updated)

    for single_tag in note_data.tags:
        if single_tag is not None:
            tag = soup.new_tag('tag')
            tag.string = single_tag
            note.append(tag)

    attributes = soup.new_tag('note-attributes')
    author = soup.new_tag('author')
    author.string = "Andrew Heiss"

    attributes.append(author)
    note.append(attributes)

    return note
Example #13
    def clean_text(self):
        text = self.cleaned_data["text"]

        soup = BeautifulSoup(text, "html.parser")
        if not isinstance(soup.contents[0], Doctype):
            doctype = Doctype("html")
            soup.insert(0, doctype)

        forms = soup.findAll("form")
        if not forms or len(forms) > 1:
            raise ValidationError(_("The template must contain one form"))

        if not forms[0].findAll("input", attrs={"type": "submit"}):
            raise ValidationError(_("The form must have a submit button"))

        imgid = 0
        for img in soup.findAll("img"):
            img["id"] = "img%s" % imgid
            imgid += 1

        return str(soup)
Example #14
def generate_enex():
    # Note data structure
    Note = namedtuple('Note', ['title', 'content', 'created', 'updated', 'tags'])

    # Generate empty XML document
    soup = BeautifulSoup(features="xml")
    soup.append(Doctype('en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"'))

    # Everything is wrapped in <en-export>
    root_tag = soup.new_tag("en-export")

    # Parse each note
    original_notes = glob.glob(note_files)

    for original_note in original_notes:
        title = os.path.basename(os.path.splitext(original_note)[0])

        with open(original_note, 'r') as f:
            text = f.read()

            content = markdown.markdown(text,
                                        extensions=[GithubFlavoredMarkdownExtension()])

        fileinfo = os.stat(original_note)
        created = time.strftime('%Y%m%dT%H%M%SZ', time.gmtime(fileinfo.st_birthtime))
        modified = time.strftime('%Y%m%dT%H%M%SZ', time.gmtime(fileinfo.st_mtime))

        tags = extract_tags(original_note)

        parsed_note = Note(title, content, created, modified, tags)

        # Append to <en-export> element
        root_tag.append(create_note(parsed_note, soup))

    # Append <en-export> to the empty XML document
    soup.append(root_tag)

    with open(out_file, 'w') as f:
        f.write(str(soup))
Example #15
def add_tags(content, has_mathjax):
    # Add html, head tag
    html = bs(str(content), "html5lib")
    doctype = Doctype('html')
    html.insert(0, doctype)
    head = html.head

    # Add style tag
    head.append(html.new_tag('style', type='text/css'))
    # Applying font
    head.style.append(
        '*{font-family: Arial, Helvetica, sans-serif !important;}')

    # For rendering maths equation
    if has_mathjax:
        # Polyfill
        polyfill = html.new_tag(
            'script',
            src="https://polyfill.io/v3/polyfill.min.js?features=es6")
        head.append(polyfill)

        # MathJax
        mathjax_actions = Path(__file__).parent.joinpath("mathjax-actions.js")
        head.append(
            html.new_tag('script',
                         src=f'file:///{mathjax_actions.as_posix()}'))
        conf = html.new_tag('script', type="text/x-mathjax-config")
        conf.append("MathJax.Hub.Config({CommonHTML: {scale: 200}});")
        head.append(conf)
        head.append(
            html.new_tag(
                'script',
                id="MathJax-script",
                attrs={'async': ''},
                src=
                "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-MML-AM_CHTML"
            ))

    return html.prettify(), has_mathjax
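
A possible call to add_tags, wrapping an HTML fragment that contains TeX math (the fragment is illustrative):

page_html, _ = add_tags(r'<p>\(E = mc^2\)</p>', has_mathjax=True)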
Example #16
    def to_plist(self):
        def prettify(soup):
            return str(soup).replace("<array>", "\n    <array>").replace("<dict>", "\n        <dict>").replace("<key>", "\n            <key>").replace("<string>", "\n            <string>").replace("</dict>", "\n        </dict>").replace("</array>", "\n    </array>").replace("</plist>", "\n</plist>")
        soup = BeautifulSoup('', 'xml')
        doctype = Doctype('plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"')
        soup.append(doctype)
        soup.append(soup.new_tag('plist', version="1.0"))
        soup.plist.append(soup.new_tag('array'))
        for itm in self.phrases:
            dct = soup.new_tag('dict')
            phrase_key = soup.new_tag('key')
            phrase_key.string = "phrase"
            phrase_string = soup.new_tag('string')
            phrase_string.string = itm['phrase']
            shortcut_key = soup.new_tag('key')
            shortcut_key.string = "shortcut"
            shortcut_string = soup.new_tag('string')
            shortcut_string.string = itm['shortcut']
            dct.append(phrase_key)
            dct.append(phrase_string)
            dct.append(shortcut_key)
            dct.append(shortcut_string)
            soup.array.append(dct)
        return prettify(soup)
Example #17
# soup = BeautifulSoup(markup, "lxml")

# tag = soup.a
# tag.string = "New link text."

# print tag
# # <a href="http://example.com/">New link text.</a>

# # /////////
# # append()
# # /////////

# soup = BeautifulSoup("<a>Foo</a>", "lxml")
# soup.a.append("Bar")

# print soup.prettify()
# # <html><body><a>FooBar</a></body></html>

# print soup.a.contents
# # [u'Foo', u'Bar']

soup = BeautifulSoup(
    '<!doctype html><html lang="en"><head><meta charset="utf-8" /> <title>Your Book Title</title><link rel="stylesheet" href="style.css"  type="text/css" /></head><body></body>',
    'lxml')
print soup.prettify()
print soup.html
print soup.body
print soup.div

print Doctype("html")
Example #18
    "https://banter-latte.com/2007/07/17/interviewing-leather-part-four/",
    "https://banter-latte.com/2007/07/24/interviewing-leather-part-five/",
    "https://banter-latte.com/2007/07/31/interviewing-leather-part-six/",
    "https://banter-latte.com/2007/08/07/interviewing-leather-part-seven/",
    "https://banter-latte.com/2007/08/14/interviewing-leather-part-eight/",
    "https://banter-latte.com/2007/08/21/interviewing-leather-part-nine/",
    "https://banter-latte.com/2007/08/28/interviewing-leather-part-ten/",
    "https://banter-latte.com/2007/09/04/interviewing-leather-part-eleven/",
    "https://banter-latte.com/2007/09/20/interviewing-leather-part-twelve/",
    "https://banter-latte.com/2007/09/25/interviewing-leather-part-thirteen/",
    "https://banter-latte.com/2007/10/02/interviewing-leather-part-fourteen/"
]

# Construct HTML skeleton
doc = BeautifulSoup()
doc.append(Doctype('html'))
html = doc.new_tag('html', lang='en-US')
doc.append(html)
head = doc.new_tag('head')
html.append(head)
meta = doc.new_tag('meta', charset='utf-8')
head.append(meta)
title = doc.new_tag('title')
title.string = 'Interviewing Leather'
head.append(title)
body = doc.new_tag('body')
html.append(body)

# Gather each chapter's content
for i, chapter in enumerate(chapters):
    # Construct h1 for the chapter
Example #19
def main(input_base_path: str, output_base_path: str) -> None:
    # Process each .html file
    input_base_path = input_base_path + '/'
    original_files = glob.glob(input_base_path + '*.html')
    output_directory = Path(output_base_path)

    output_directory.mkdir(exist_ok=True)

    for _original_file_as_str in original_files:

        original_file = Path(_original_file_as_str)
        original_full_filename = input_base_path + original_file.name
        soup = load_soup_file(original_full_filename)

        # META charset
        # Delete any that exist.  Put in a correct one.
        charset_metas = get_charset_metas(soup)
        for _cs in charset_metas:
            _cs.decompose()
        new_meta = soup.new_tag('meta')
        new_meta.attrs['http-equiv'] = "content-type"
        new_meta.attrs['content'] = "text/html; charset=UTF-8"
        newline = NavigableString('\n')
        soup.head.insert(0, new_meta)
        soup.head.insert(0, newline)

        # META Viewports
        # Leave alone if already one there.  Otherwise, put in a correct one.
        viewport_metas = get_viewport_metas(soup)
        if not viewport_metas:
            new_meta = soup.new_tag('meta')
            new_meta.attrs['name'] = 'viewport'
            new_meta.attrs['content'] = "width=device-width, initial-scale=1.0"
            newline = NavigableString('\n')
            soup.head.insert(0, new_meta)
            soup.head.insert(0, newline)

        # META robots
        # If one exists, leave it alone.  Otherwise add one
        robots = soup.find_all('meta', {'name': "robots"})
        if not robots:
            new_meta = soup.new_tag('meta')
            new_meta.attrs['name'] = "robots"
            new_meta.attrs['content'] = "index,follow"
            newline = NavigableString('\n')
            soup.head.insert(0, new_meta)
            soup.head.insert(0, newline)

        # Delete: <meta content="OpenOffice.org 3.3 (Win32)" name="GENERATOR"/>
        open_office = soup.find_all('meta', {
            'content': "OpenOffice.org 3.3 (Win32)",
            'name': "GENERATOR"
        })
        for tag in open_office:
            tag.decompose()

        # Delete any empty titles
        for tag in soup.find_all('title'):
            if not tag.contents:
                tag.decompose()

        # Remove any <font></font> that is identical to its parent
        for tag in soup.find_all('font'):
            if tag.parent.name == tag.name:
                if tag.parent.attrs.keys() == tag.attrs.keys():
                    tag.parent.unwrap()

        # Remove any <big></big> that is identical to its parent
        for tag in soup.find_all('big'):
            if tag.parent.name == tag.name:
                if tag.parent.attrs.keys() == tag.attrs.keys():
                    tag.parent.unwrap()

        # Modify freepages links to local if possible.
        # Is it from rootsweb?  Does it have the same filename locally?
        for tag in soup.find_all('a'):
            try:
                parsed = urlparse(tag['href'])
            except (KeyError, ValueError):  # tag may lack an href, or the URL may be unparseable
                print(f"Strange tag: {tag}\n")
                continue
            if parsed.netloc != 'freepages.genealogy.rootsweb.ancestry.com':
                continue
            filename = Path(parsed.path).name
            full_filename = Path(input_base_path + filename)
            if full_filename.is_file():
                # Everything is OK to change link.
                tag['href'] = filename

        # Clean spaces inside of <a> </a> strings
        for tag in soup.find_all('a'):
            if len(tag.contents) == 1:
                tag.string = html_text_spaces_clean(tag.string)

        # Add lang="en" to <html>
        for tag in soup.find_all('html'):
            tag['lang'] = 'en'

        # Remove: <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
        for item in soup.contents:
            if isinstance(item, Doctype):
                item.extract()

        # Add <!DOCTYPE html> to top of file
        tag = Doctype('html')
        soup.insert(0, tag)
        # Remove any added space just below doctype html
        if soup.contents[1].string == '\n':
            soup.contents[1].extract()

        # remove spaces around text several tags:
        for tag in soup.find_all(['title', 'h1', 'h2', 'h3', 'h4']):
            if len(tag.contents) == 1:
                tag.string = html_text_spaces_clean(tag.string)

        # <head> reduce blank lines
        for tag in soup.head.contents[1:]:
            if tag.string == '\n' and tag.previous_sibling.string == '\n':
                tag.extract()

        # <head> indent each tag inside 4 spaces
        # Note the two passes.  It's required because inserting into .contents
        # while looping would cause an infinite loop.
        head_tags = list()
        for tag in soup.head.contents:
            if tag.string == '\n':
                continue
            head_tags.append(tag)
        for tag in head_tags:
            indent = NavigableString('    ')
            tag.insert_before(indent)

        # Add a newline after </head> for .html readability
        newline = NavigableString('\n')
        soup.head.insert_after(newline)

        # fix html checker:
        # "The type attribute for the style element is not needed and should be omitted.""
        for tag in soup.find_all('style'):
            del tag['type']

        ## Table Cleaning

        # <td> </td> try to keep to one line
        for tag in soup.find_all('td'):
            if len(tag.contents) > 1:
                if tag.contents[-1].string == '\n':
                    tag.contents[-1].extract()
            if len(tag.contents) > 1:
                if tag.contents[0].string == '\n':
                    tag.contents[0].extract()

        print("At end")
        #embed()

        # ! Remember to use soup.prettify() ONLY for visuals.  Always write as str(soup)
        # ! Otherwise whitespace and other formatting is gone.
        output_file = Path(os.path.join(output_directory, original_file.name))
        with open(output_file, 'w') as fp:
            fp.write(str(soup))
Example #20
def main(input_base_path:str, output_base_path:str) -> None:
    # Process each .html file
    input_base_path = input_base_path + '/'
    original_files = glob.glob(input_base_path + '*.html')
    output_directory = Path(output_base_path)

    output_directory.mkdir(exist_ok=True)

    for _original_file_as_str in original_files:

        original_file = Path(_original_file_as_str)
        original_full_filename = input_base_path + original_file.name

        soup = load_soup_file(original_full_filename)

        # META charset
        # Delete any that exist.  Put in a correct one.
        charset_metas = get_charset_metas(soup)
        for _cs in charset_metas:
            _cs.decompose()
        new_meta = soup.new_tag('meta')
        new_meta.attrs['http-equiv'] = "content-type"
        new_meta.attrs['content'] = "text/html; charset=UTF-8"
        newline = NavigableString('\n')
        soup.head.insert(0, new_meta)
        soup.head.insert(0, newline)

        # META Viewports
        # Leave alone if already one there.  Otherwise, put in a correct one.
        viewport_metas = get_viewport_metas(soup)
        if not viewport_metas:
            new_meta = soup.new_tag('meta')
            new_meta.attrs['name'] = 'viewport'
            new_meta.attrs['content'] = "width=device-width, initial-scale=1.0"
            newline = NavigableString('\n')
            soup.head.insert(0, new_meta)
            soup.head.insert(0, newline)

        # Delete: <meta content="OpenOffice.org 3.3 (Win32)" name="GENERATOR"/>
        open_office = soup.find_all('meta', {
            'content':"OpenOffice.org 3.3 (Win32)",
            'name': "GENERATOR"
        })
        for tag in open_office:
            tag.decompose()

        # Add lang="en" to <html>
        for tag in soup.find_all('html'):
            tag['lang'] = 'en'

        # Remove: <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
        for item in soup.contents:
            if isinstance(item, Doctype):
                item.extract()

        # Add <!DOCTYPE html> to top of file
        tag = Doctype('html')
        soup.insert(0, tag)
        # Remove any added space just below doctype html
        if soup.contents[1].string == '\n':
            soup.contents[1].extract()


        # Run through tidy
        html, errors = tidy_document(
            str(soup),
            options= {
                "indent": 1,              # Pretty; not too much of a performance hit
                "tidy-mark": 0,           # No tidy meta tag in output
                "doctype": 'html5',
                "drop-empty-elements": 0,
                "drop-empty-paras": 0,
                "add-meta-charset": 1,
                "logical-emphasis": 1,
                "preserve-entities": 1,
                "literal-attributes": 1,
                "priority-attributes": "name,content,rel,href",
                "wrap": 80
            })

        # embed()

        output_file = Path(os.path.join(output_directory, original_file.name))
        with open(output_file, 'w') as fp:
            fp.write(html)
Example #21
from sys import argv as args, exit
from bs4 import BeautifulSoup, Doctype

if __name__ == "__main__":
	if len(args) != 3:
		exit(1)
	with open(args[1], 'r') as inputFile:
		soup = BeautifulSoup(inputFile, "lxml")
		html = BeautifulSoup("", "html5lib")
		html.contents.insert(0, Doctype("html"))
		html.contents[1]["lang"] = "ja"

		meta = html.new_tag("meta")
		meta["charset"] = "UTF-8"
		html.contents[1].head.contents.append(meta)

		script = html.new_tag("script")
		script["type"] = "text/javascript"
		script["src"] = "js/functions.js"
		html.contents[1].head.contents.append(script)

		svg = soup.find("svg")

		html.contents[1].body.contents.append(svg)

		nodes = svg.find_all("g", class_="node")
		for node in nodes:
			node["onclick"] = "onClick(this);"
		with open(args[2],'w') as outputFile:
			outputFile.write(str(html)+"\n")
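
Run as, e.g., python annotate_svg.py graph.svg out.html (the script name is illustrative): the script lifts the first <svg> out of the input, drops it into a fresh html5lib skeleton with a doctype, charset meta, and script include, and wires an onClick handler onto every graph node.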
Example #22
def process_chapter(chapter: str) -> BeautifulSoup:
    # Trim navpanels
    while NAVPANEL_START in chapter:
        chapter = delete_delimited_chunk(chapter, NAVPANEL_START, NAVPANEL_END)

    # We want to remove the sub-chapter links unless it's the intro page
    if IRBOOK_MARKER not in chapter:
        while CHILD_LINKS_START in chapter:
            chapter = delete_delimited_chunk(chapter, CHILD_LINKS_START,
                                             CHILD_LINKS_END)

    chapter_soup = soups(chapter)

    # Delete elements that epub doesn't like
    for el in chapter_soup.find_all('meta'):
        el.decompose()

    # Delete the "autogenerated page" footer
    address = chapter_soup.find('address')
    if address:
        address.decompose()

    # The link 404's anyways
    css_link = chapter_soup.find(
        'link',
        href='https://nlp.stanford.edu/IR-book/html/htmledition/irbook.css')
    if css_link:
        css_link.decompose()

    # Who puts a <br> in a header??!
    h1 = chapter_soup.find('h1')
    if h1:
        br = h1.find('br')
        if br:
            br.decompose()

    for s in chapter_soup.find_all(text=re.compile("[`']")):
        if type(s) in [
                Comment, bs4.Doctype, bs4.ProcessingInstruction,
                bs4.Declaration
        ]:
            continue
        old_s = str(s)
        new_s = (old_s.replace('``', '“').replace("''", '”').replace(
            "'", '’').replace("`", '‘'))
        s.replace_with(new_s)

    for img in chapter_soup.find_all('img'):
        alt = img['alt']
        if '...' in alt:
            # they literally deleted half the source i'd need to correctly
            # reproduce the larger figures...
            alt = expand_ellipsized(img)
            if alt is None:
                continue
        if alt.endswith('.html'):
            # wtf are you doing
            continue
        if r'\includegraphics' in alt:
            continue

        # cross-reference symbol
        if alt == '[*]':
            img.parent.string = '‡'
            continue

        mathml = trivial_tex_to_mathml(alt)
        if mathml is None:
            # Otherwise, give SnuggleTeX a try:
            try:
                mathml = tex_to_mathml(alt)
            except TeXRenderError as e:
                e.context = img.parent
                raise

        img.replace_with(soups(mathml, 'html.parser'))

    # delete empty children at the end of the document
    annoying_tag_names = ['hr', 'p', 'br']
    empty_tags = ['hr', 'br']
    children = list(chapter_soup.body.find_all(True))
    if children:
        for last_child in reversed(children):
            if last_child.name in annoying_tag_names:
                if last_child.name in empty_tags:
                    last_child.decompose()
                    continue

                if last_child.string and last_child.string.strip():
                    break
                else:
                    last_child.decompose()
            else:
                break

    # this prevents a file from being invalid xhtml lol
    for bad_a in chapter_soup.find_all('a'):
        if bad_a.has_attr('wikipedia:general'):
            del bad_a['wikipedia:general']
            break

    # # We're renaming everything to .xhtml
    # for el in itertools.chain(chapter_soup.find_all('link'),
    #                           chapter_soup.find_all('a')):
    #     if el.has_attr('href') and not el['href'].startswith('http'):
    #         el['href'] = el['href'].replace('.html', '.xhtml')

    # Get rid of naughty attributes
    for el in chapter_soup.find_all(True):
        el['class'] = ''
        for attr in [
                'align', 'valign', 'cellpadding', 'border', 'nowrap', 'compact'
        ]:
            if el.has_attr(attr):
                el['class'] += attr + '-' + el[attr].lower()
                del el[attr]

        if el.has_attr('width'):
            if el['width'] == '100%':
                el['class'] += ' full-width'
            else:
                el['class'] += ' width-' + el['width']
            del el['width']

        if not el['class']:
            del el['class']

    for el in chapter_soup.find_all('br'):
        if el.has_attr('clear'):
            del el['clear']

    for el in chapter_soup.find_all('tt'):
        el.name = 'code'

    chapter_soup.html.head.append(chapter_soup.new_tag('meta',
                                                       charset='utf-8'))
    chapter_soup.html.head.append(
        chapter_soup.new_tag(
            'link',
            rel='stylesheet',
            type='text/css',
            href='Styles/book.css',
        ))

    first_child = next(chapter_soup.children)
    if isinstance(first_child, Doctype):
        # XHTML has no doctype, and Doctype has no decompose method
        # first_child.replace_with(soups('<!DOCTYPE html>'))
        first_child.replace_with(Doctype('html'))

    for el in chapter_soup.children:
        if isinstance(el,
                      Comment) and el.startswith(CONVERSION_COMMENT_MARKER):
            el.replace_with('')
            break

    # chapter_soup.find('html')['xmlns:epub'] = 'http://www.idpf.org/2007/ops'

    # chapter_soup.smooth()
    return chapter_soup
Example #23
    def clean_text(self):
        text = self.cleaned_data["text"]
        timeout = self.data["timeout"]

        javascript = """
        function fullscreen_display() {
                var images = document.getElementsByTagName('img');
                for (i = 0; i < images.length;i++ ) {
                    images[i].style.display = "none";
                }
                var divs = document.getElementsByTagName('div');
                for (i = 0; i < divs.length;i++ ) {
                    var images = divs[i].getElementsByTagName('img');
                    for (j = 0; j < images.length;j++ ) {
                        images[j].style.display = "block";
                        images[j].style.marginLeft = "auto";
                        images[j].style.marginRight = "auto";
                    }
                    divs[i].style.display = "block";
                }
            }
        function launchIntoFullscreen(element) {
                if(element.requestFullscreen) {
                    element.requestFullscreen();
                }
                else if(element.mozRequestFullScreen) {
                    element.mozRequestFullScreen();
                } else if(element.webkitRequestFullscreen) {
                    element.webkitRequestFullscreen();
                } else if(element.msRequestFullscreen) {
                    element.msRequestFullscreen();
                }
                    fullscreen_display();
                    window.setTimeout(function() {
                    location.href = "FIXMEURL";
                }, %s);
        }
    """ % (int(timeout) * 1000)

        soup = BeautifulSoup(text, "html.parser")
        if not isinstance(soup.contents[0], Doctype):
            doctype = Doctype("html")
            soup.insert(0, doctype)

        if "style" in soup.body.attrs:
            soup.html.attrs["style"] = soup.body.attrs["style"]

        script = soup.new_tag("script")
        script.attrs["type"] = "text/javascript"
        script.append(javascript)

        jquery = soup.new_tag("script")
        jquery.attrs["type"] = "text/javascript"
        jquery.attrs["src"] = "https://code.jquery.com/jquery.min.js"

        jqlist = soup.head.findAll(
            "script", {"src": "https://code.jquery.com/jquery.min.js"})
        if not jqlist:
            soup.head.append(jquery)

        for div in soup.body.findAll("div"):
            if "style" in div.attrs:
                div.attrs["style"] = "%s; %s;" % (div.attrs["style"],
                                                  "display:none")
            else:
                div.attrs["style"] = "display:none"

        scriptlist = soup.head.findAll("script", string="launchIntoFullscreen")
        if not scriptlist:
            soup.head.append(script)

        if not soup.body.findAll("img"):
            raise ValidationError(_("The form must have at least one image"))

        imgid = 0
        for img in soup.findAll("img"):
            img["onclick"] = "javascript:launchIntoFullscreen(document.documentElement);"
            img["id"] = "img%s" % imgid
            imgid += 1

        return str(soup)
Example #24
    def html_makepage(self, plot_title=None, plot_notes=None):
        """
        Generate HTML document from scratch with plot image and store it in self.html_page
        - plot_title: (string) alternative title for the plot
        - plot_notes: (list of string) optional text to add below plot_title
        """
        # Path to image file of the plot
        # Use SVG file for better scaling quality
        try:
            img_source_path = self.output_path['svg']
        except AttributeError:
            errmsg = "Path to plot render for HTML page not found. Method self.set_output_paths() not called yet."
            error_exit(self.log, errmsg)

        # Main titles
        page_title = self.title

        if self.datelim and plot_title is None:
            plot_title = "Graph from {} to {}".format(*self.datelim)

        # Head and title of the HTML page
        head = "<head><meta /><title>{}</title></head>".format(page_title)
        page = BeautifulSoup(head, 'lxml')
        page.insert(0, Doctype('html'))
        page.html['lang'] = 'en'
        page.head.meta['charset'] = 'utf-8'

        # CSS style: take from file defined in configuration
        html_css_file = MainConf.get('reports', 'html_main_cssfile', fallback='html_main_style.html', mandatory=False)
        css_style = DataFile(html_css_file, mandatory=True).contents
        page.head.append(css_style.find('style'))
        self.log.debug(f"HTML page: added CSS style from file: {html_css_file}")

        # Body and main title
        newobj = page.html
        for tag in ['body', 'h1']:
            newtag = page.new_tag(tag)
            newobj.append(newtag)
            newobj = newobj.contents[-1]
        page.h1.string = page_title

        # Render plot in SVG format
        img_block = page.new_tag('div')
        img_block['class'] = 'blockcard'

        if plot_title is not None:
            img_block.append(page.new_tag('h2'))
            img_block.h2.string = plot_title
            self.log.debug("HTML page: plot sub-title added")

        if plot_notes is not None:
            if not isinstance(plot_notes, list):
                plot_notes = [plot_notes]
            for note in plot_notes:
                img_block.append(page.new_tag('p'))
                p_block = img_block.contents[-1]
                p_block.string = note
            self.log.debug("HTML page: %s notes added", len(plot_notes))

        img_block.append(page.new_tag('img'))
        img_block.img['class'] = 'plotrender'
        img_block.img['src'] = img_source_path
        img_block.img['alt'] = self.title
        page.body.append(img_block)
        self.log.info("HTML page: plot render '%s' added to report page", img_block.img['src'])

        # Render container for tables
        tables_block = page.new_tag('div')
        tables_block['id'] = 'tablescontainer'
        tables_block['class'] = 'blockcard'
        page.body.append(tables_block)

        self.html_page = page