Example #1
0
def save_note(media_cache, note, note_info, note_paths):
    notebook_slug = slugify(note_info.notebook.name)
    note_slug = slugify(str(note.title))

    soup = BeautifulSoup('<html><head><title>{}</title></head><body></body></html>'.format(note.title), 'html.parser')
    note_soup = BeautifulSoup(note.content, 'html.parser').find('en-note')
    note_soup.name = 'div'
    note_soup['class'] = 'note'
    soup.body.append(note_soup)

    meta_tags = {}

    def add_meta_tag(name, content):
        meta_tags[name] = content
        # 'name' collides with new_tag()'s first positional argument, so set it afterwards
        new_tag = soup.new_tag('meta', content=content)
        new_tag['name'] = name
        soup.head.append(new_tag)

    add_meta_tag('slug', note_slug)
    add_meta_tag('category', note_info.notebook.name)
    add_meta_tag('date', format_timestamp(note.created))
    add_meta_tag('modified', format_timestamp(note.updated))

    path_map = {
        'notebook': notebook_slug,
        'note': note_slug
    }
    content_path = note_paths.content.format(**path_map)
    html_path = note_paths.html.format(**path_map)
    file_path = note_paths.file.format(**path_map)

    replace_media_tags(media_cache, soup, note, note_info.store, html_path, file_path, add_meta_tag)

    if note.attributes.latitude is not None and note.attributes.longitude is not None:
        place = note_info.get_place(note.attributes.latitude, note.attributes.longitude)
        if place:
            add_meta_tag('city', place)
        else:
            add_meta_tag('latitude', str(note.attributes.latitude))
            add_meta_tag('longitude', str(note.attributes.longitude))

    tags = linkify_soup(soup, soup.new_tag)
    add_meta_tag('tags', ', '.join(tags))

    summary = get_summary(note_soup, 120)
    add_meta_tag('summary', summary)

    # Facebook OG tags
    add_meta_tag('og:title', note.title)
    add_meta_tag('og:site_name', 'avalarky report')
    add_meta_tag('og:description', summary)
    add_meta_tag('og:type', 'article')
    if 'hero_image' in meta_tags:
        add_meta_tag('og:image', meta_tags['hero_image'])

    if not path.exists(content_path):
        os.makedirs(content_path, mode=0o755)

    with open(path.join(content_path, 'index.html'), 'w', encoding='utf-8') as f:
        f.write(soup.prettify())
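
All of these examples pivot on the same BeautifulSoup move: assigning to Tag.name renames an element in place while keeping its children and attributes. A minimal, self-contained sketch of that pattern (the input HTML here is invented for illustration):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<en-note><p>Hello</p></en-note>', 'html.parser')

tag = soup.find('en-note')
tag.name = 'div'       # rename the element; children are untouched
tag['class'] = 'note'  # attributes behave like dict entries

print(soup)  # <div class="note"><p>Hello</p></div>
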
Example #2
0
def transformColorizeCode(soup, cssClass, sourceLang):
    for elem in soup.select('pre.' + cssClass):
        input_str = elem.string
        if input_str and input_str[0] == '\n':
            # hack for leading empty line
            input_str = input_str[1:]

        colorized = sourceHighlight(input_str, sourceLang)

        # source-highlight generates <pre><tt>...</tt></pre>, get rid of <tt>
        new_elem = BeautifulSoup(colorized, 'html.parser').tt
        new_elem.name = 'pre'
        new_elem['class'] = cssClass

        elem.replace_with(new_elem)
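
The "XXX: parse just a fragment - how?" question that recurs in these snippets has a short answer: the html.parser builder leaves fragments unwrapped, while lxml wraps everything in a full document, which is why the lxml variants have to dig the fragment back out via .tt. A quick comparison:

from bs4 import BeautifulSoup

fragment = '<pre><tt>print("hi")</tt></pre>'

# html.parser keeps the fragment as-is...
print(BeautifulSoup(fragment, 'html.parser'))
# <pre><tt>print("hi")</tt></pre>

# ...while lxml wraps it in a full document.
print(BeautifulSoup(fragment, 'lxml'))
# <html><body><pre><tt>print("hi")</tt></pre></body></html>
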
Example #4
0
def transform_wikipage(page):
    page = page.replace("~CLEAR~", '<div style="clear: both;"></div>')
    page = page.replace("retroherna.cz", "retroherna.org")
    page = BeautifulSoup(page, "lxml")
    for a in page.find_all('a'):
        if a.get('href') and "/wiki/doku.php" in a['href']:
            a['href'] = a['href'].replace("/wiki/doku.php?id=web:",
                                          "/").replace(':', '/')

    for img in page.find_all('img'):
        if not img['src'].startswith("http"):
            img['src'] = img['src'].replace(
                "/wiki/lib/exe/fetch.php",
                "https://retroherna.org/wiki/lib/exe/fetch.php")
        title = img.get('title')

        parent = img.parent
        if parent.name == "a" and parent['href'].startswith("/wiki"):
            parent.name = "div"
            del parent['href']
        else:
            parent = page.new_tag("div")
            img.wrap(parent)
        classes = img.get('class') or []
        parent['class'] = classes + ["mediawrap"]
        if 'mediacenter' in classes and img.get('width'):
            # life is too short
            parent['style'] = 'width: {}px;'.format(img['width'])
        del img['class']

        # XXX yes this is necessary, thanks dokuwiki
        if title and not any(
                title.endswith(t) for t in ("png", "jpg", "jpeg", "gif")):
            title_div = page.new_tag("div")
            title_div['class'] = "mediatitle"
            if img.get('width'):
                title_div['style'] = "max-width: {}px;".format(img['width'])
            title_div.string = title
            parent.append(title_div)

    page = page.html.body
    page.name = "section"

    #page = "".join(str(page))
    return page
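
img.wrap(parent) above is the generic way to slip a new container around an existing node. A small sketch (tag and class names are illustrative):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<p><img src="x.png"/></p>', 'html.parser')

wrapper = soup.new_tag('div')
wrapper['class'] = 'mediawrap'
soup.img.wrap(wrapper)  # moves the img inside the new div

print(soup)  # <p><div class="mediawrap"><img src="x.png"/></div></p>
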
Example #5
0
def transformColorizeCode(soup, cssClass, sourceLang):
    for elem in soup.select('pre.' + cssClass):
        input_str = elem.string
        if input_str and input_str[0] == '\n':
            # hack for leading empty line
            input_str = input_str[1:]

        colorized = sourceHighlight(input_str, sourceLang)

        origTitle = elem.get('title', None)

        # source-highlight generates <pre><tt>...</tt></pre>, get rid of <tt>
        new_elem = BeautifulSoup(colorized, 'lxml').tt    # XXX: parse just a fragment - how?
        new_elem.name = 'pre'
        new_elem['class'] = cssClass

        if origTitle is not None:
            # Preserve title (hover tool tip)
            new_elem['title'] = origTitle

        elem.replace_with(new_elem)
Example #7
0
    def parse_content(self):
        # Selenium 4 style; requires: from selenium.webdriver.common.by import By
        title = self.driver.find_element(By.CSS_SELECTOR,
                                         'span#ai_cm_title').text
        body = BeautifulSoup(self.driver.page_source, 'html.parser')
        body = body.select('wrap_copy')[0]
        body.name = 'div'

        self.body_process(body)
        for div in body.select('div.comment_crop_href_mp4'):
            div.extract()

        for div in body.find_all('div',
                                 id=lambda x: x and x.startswith('show_')):
            div.extract()

        body = str(body)

        item = Item(title=title, text=body, created_at=datetime.now())
        print(f'created {item.title}')
        self.driver.back()
        return item
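
extract() removes a tag from the tree and hands it back; when the subtree is never needed again, decompose() destroys it outright, which also frees its memory. A quick illustration:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<div><span>keep</span><span>drop</span></div>',
                     'html.parser')

removed = soup.find_all('span')[1].extract()  # detached but still usable
print(removed)  # <span>drop</span>
print(soup)     # <div><span>keep</span></div>

soup.span.decompose()  # gone for good
print(soup)     # <div></div>
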
Example #8
0
            outfile = open(join(postpath, getpagename(pager)),
                           "w+",
                           encoding="utf-8")

    body = BeautifulSoup(open(join(postpath, filename), "r", encoding="utf-8"),
                         "html.parser").body

    for image in body.find_all("img"):
        if checkfileattr(image, "media", "src"):
            if isfile(join(postpath, image["src"] + ".jpg")):
                image["src"] += ".jpg"
                image.parent["href"] += ".jpg"
            elif isfile(join(postpath, image["src"] + ".png")):
                image["src"] += ".png"
                image.parent["href"] += ".png"
    body.name = "div"
    sketch.body.append(body)
    postscount += 1
    if postscount == postsperpage:
        postscount = 0
        pager += 1

if postscount == 0:
    pager -= 1
if pager > 1:
    prevpage = sketch.new_tag("a", href=getpagename(pager - 1))
    prevpage.string = "Previous Page"
    sketch.body.append(prevpage)
outfile.write(sketch.prettify())
outfile.close()
print("The generated HTML file(s) are saved as " +
Example #9
0
def make_top_level_tag_body(story: BeautifulSoup):
    story.name = "body"
    del story['class']


def create_transcript(url_transcript, url_image, page_nb, path_to_doc,
                      document_title, document_desc, document_lang):
    """Create xml file containing a page transcript from Transkribus, in PAGE standard. File is name after
    corresponding page number.

    :param url_transcript: url to request transcript, provided by Transkribus.
    :param url_image: url to request image of current document/subcollection page.
    :param page_nb: current page number.
    :param path_to_doc: path to directory for the current document/subcollection.
    :param document_title: title of current document/subcollection.
    :param document_desc: description of current document/subcollection.
    :param document_lang: list of languages for current document/subcollection, separated by commas.
    :type url_transcript: string
    :type url_image: string
    :type page_nb: integer
    :type path_to_doc: string
    :type document_title: string
    :type document_desc: string
    :type document_lang: string
    :return: a status to signal possible server errors.
    :rtype: boolean
    """
    response = requests.request("GET", url_transcript)
    document_title = "<title>%s</title>" % document_title
    document_desc = "<desc>%s</desc>" % document_desc
    document_page_nb = "<pagenumber>%s</pagenumber>" % page_nb
    tag_title = BeautifulSoup(document_title, "xml")
    tag_desc = BeautifulSoup(document_desc, "xml")
    tag_page_nb = BeautifulSoup(document_page_nb, "xml")
    tag_title = tag_title.title.extract()
    tag_desc = tag_desc.desc.extract()
    tag_page_nb = tag_page_nb.pagenumber.extract()
    tag_title.name = "tu:title"
    tag_desc.name = "tu:desc"
    tag_page_nb.name = "tu:pagenumber"
    if len(document_lang) != 0:
        document_lang = ''.join([
            "<language>%s</language>" % l.strip()
            for l in document_lang.split(",")
        ])
        document_lang = "<languages>%s</languages>" % document_lang
        tag_languages = BeautifulSoup(document_lang, "xml")
        tag_languages = tag_languages.languages.extract()
        tag_lang_list = tag_languages.findAll("language")
        for tag in tag_lang_list:
            tag.name = "tu:language"

    if response.status_code == 503:
        error = True
    else:
        error = False
        xml = response.text
        path_to_file = os.path.join(path_to_doc, "%s.xml" % page_nb)
        soup = BeautifulSoup(xml, "xml")
        # Adding namespace declaration for element added by Time Us project
        # Adding attributes to Page elements : @timeUs:url and @timeUs:id
        if soup.PcGts:
            soup.PcGts["xmlns:tu"] = "timeUs"
            soup.Page["tu:url"] = url_image
            soup.Page["tu:id"] = page_nb
            soup.Metadata.append(tag_title)
            soup.Metadata.append(tag_desc)
            soup.Metadata.append(tag_page_nb)
            if len(document_lang) != 0:
                for tag in tag_lang_list:
                    soup.Metadata.append(tag)
            with open(path_to_file, "w") as f:
                f.write(str(soup))
    return error
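
Round-tripping every element through a one-tag BeautifulSoup parse plus extract() works, but new_tag() builds the same thing directly; tag names are plain strings, prefix included. A sketch using the tu: prefix from the snippet above:

from bs4 import BeautifulSoup

soup = BeautifulSoup("<PcGts><Metadata/></PcGts>", "xml")

tag_title = soup.new_tag("tu:title")
tag_title.string = "Some document"
soup.Metadata.append(tag_title)

print(soup.Metadata)  # <Metadata><tu:title>Some document</tu:title></Metadata>
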
Example #11
0
# Download the corresponding image
path_to_img = os.path.join(path_to_doc_dir, page["imgFileName"])
response = requests.get(page_url_img)
with open(path_to_img, 'wb') as file:
    file.write(response.content)

t_title = "<title>%s</title>" % doc_title
tag_title = BeautifulSoup(t_title, "xml")
tag_title = tag_title.title.extract()
tag_title.name = "temp:title"

t_uploader = "<uploader>%s</uploader>" % doc_uploader
tag_uploader = BeautifulSoup(t_uploader, "xml")
tag_uploader = tag_uploader.uploader.extract()
tag_uploader.name = "temp:uploader"

t_desc = "<desc>%s</desc>" % doc_desc
tag_desc = BeautifulSoup(t_desc, "xml")
tag_desc = tag_desc.desc.extract()
tag_desc.name = "temp:desc"

t_nb = "<pagenumber>%s</pagenumber>" % page_nb
Example #12
0
def func1():
    try:
        headers = {
            'Host': "www.mzitu.com",
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
        }
        #proxy = {"http":"10.10.1.10:3128", "https":"10.10.1.10:1080"}

        html = requests.get(
            "http://www.mzitu.com/zipai/comment-page-1/#comments",
            headers=headers)
        #html = request.urlopen(r)
        print(html.text)
        html_doc = """
        <html><head><title>The Dormouse's story</title></head>
        <body>
        <p class="title"><b>The Dormouse's story</b></p>
        
        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>
        
        <p class="story">...</p>
        """
        other_doc = str(html.text)
        print(type(other_doc))
        soup = BeautifulSoup(html.text, "html.parser")  # parse the text, not the Response object
        print(soup.prettify())
        print(soup.title)
        print(soup.title.name)
        print(soup.p)
        print(soup.title.string)
        print(soup.find_all("a"))
        print(soup.find(id="link3"))
        for link in soup.find_all("a"):
            print(type(link))
            print(link["href"])
        print(type(soup.a["href"]))
        print(soup.get_text())
        print(soup.getText())
        tag = soup.find("a")
        tag.name = "bee"
        print(soup.getText())
        print(soup.get_text())
        print(type(soup.title.string))
        # print(unicode(soup.title.string))
        soup.name = "bee"
        print(soup.name)
    except ConnectionError:
        return
    except requests.HTTPError:
        return
    except TimeoutError:
        return
    except requests.TooManyRedirects:
        return
    except requests.exceptions.RequestException as e:
        print(e.args[0])
        return
    else:
        print("没有异常")
    finally:
        print("完成")
    return
Example #13
-1
def popover(number, note):
    # we use BeautifulSoup to fix broken markup, e.g. incomplete span tags.
    # lxml always wraps its input in <html><body>, so find('body') is reliable here.
    note = BeautifulSoup(normalize_whitespace(note), 'lxml').find('body')
    note.name = 'div'
    a = new_tag(
        soup,
        'a',
        new_tag(soup, 'sup', number),
        **{
            'style': 'text-decoration: underline; cursor: pointer;',
            'class': 'popover-note',
            'data-original-title': 'Note %s' % number,
            'data-content': str(note),
        })
    return str(a)