def save_note(media_cache, note, note_info, note_paths):
    notebook_slug = slugify(note_info.notebook.name)
    note_slug = slugify(unicode(note.title))

    soup = BeautifulSoup('<html><head><title>{}</title></head><body></body></html>'.format(note.title), 'html.parser')
    note_soup = BeautifulSoup(note.content, 'html.parser').find('en-note')
    note_soup.name = 'div'
    note_soup['class'] = 'note'
    soup.body.append(note_soup)

    meta_tags = {}

    def add_meta_tag(name, content):
        meta_tags[name] = content
        new_tag = soup.new_tag('meta', content=content)
        new_tag['name'] = name
        soup.head.append(new_tag)

    add_meta_tag('slug', note_slug)
    add_meta_tag('category', note_info.notebook.name)
    add_meta_tag('date', format_timestamp(note.created))
    add_meta_tag('modified', format_timestamp(note.updated))

    path_map = {
        'notebook': notebook_slug,
        'note': note_slug
    }
    content_path = note_paths.content.format(**path_map)
    html_path = note_paths.html.format(**path_map)
    file_path = note_paths.file.format(**path_map)

    replace_media_tags(media_cache, soup, note, note_info.store, html_path, file_path, add_meta_tag)

    if note.attributes.latitude is not None and note.attributes.longitude is not None:
        place = note_info.get_place(note.attributes.latitude, note.attributes.longitude)
        if place:
            add_meta_tag('city', place)
        else:
            add_meta_tag('latitude', note.attributes.latitude)
            add_meta_tag('longitude', note.attributes.longitude)

    tags = linkify_soup(soup, soup.new_tag)
    add_meta_tag('tags', u', '.join(tags))

    summary = get_summary(note_soup, 120)
    add_meta_tag('summary', summary)

    # Facebook OG tags
    add_meta_tag('og:title', note.title)
    add_meta_tag('og:site_name', 'avalarky report')
    add_meta_tag('og:description', summary)
    add_meta_tag('og:type', 'article')
    if 'hero_image' in meta_tags:
        add_meta_tag('og:image', meta_tags['hero_image'])

    if not path.exists(content_path):
        os.makedirs(content_path, mode=0755)

    with open(path.join(content_path, 'index.html'), 'w') as f:
        f.write(soup.prettify().encode('utf-8'))
def transformColorizeCode(soup, cssClass, sourceLang):
    for elem in soup.select('pre.' + cssClass):
        input_str = elem.string
        if len(input_str) > 0 and input_str[0] == '\n':  # hack for leading empty line
            input_str = input_str[1:]
        colorized = sourceHighlight(input_str, sourceLang)

        # source-highlight generates <pre><tt>...</tt></pre>, get rid of <tt>
        new_elem = BeautifulSoup(colorized).tt  # XXX: parse just a fragment - how?
        new_elem.name = 'pre'
        new_elem['class'] = cssClass
        elem.replace_with(new_elem)
def transform_wikipage(page):
    page = page.replace("~CLEAR~", '<div style="clear: both;"></div>')
    page = page.replace("retroherna.cz", "retroherna.org")
    page = BeautifulSoup(page, "lxml")

    for a in page.find_all('a'):
        if a.get('href') and "/wiki/doku.php" in a['href']:
            a['href'] = a['href'].replace("/wiki/doku.php?id=web:", "/").replace(':', '/')

    for img in page.find_all('img'):
        if not img['src'].startswith("http"):
            img['src'] = img['src'].replace(
                "/wiki/lib/exe/fetch.php",
                "https://retroherna.org/wiki/lib/exe/fetch.php")

        title = img.get('title')

        parent = img.parent
        if parent.name == "a" and parent['href'].startswith("/wiki"):
            parent.name = "div"
            del parent['href']
        else:
            parent = page.new_tag("div")
            img.wrap(parent)

        classes = img.get('class')
        parent['class'] = classes + [" mediawrap"]
        if 'mediacenter' in classes and img.get('width'):  # life is too short
            parent['style'] = 'width: {}px;'.format(img['width'])
        del img['class']

        # XXX yes this is necessary, thanks dokuwiki
        if title and not any(title.endswith(t) for t in ("png", "jpg", "jpeg", "gif")):
            title = page.new_tag("div")
            title['class'] = "mediatitle"
            if img.get('width'):
                title['style'] = "max-width: {}px;".format(img['width'])
            title.string = img['title']
            parent.append(title)

    page = page.html.body
    page.name = "section"
    #page = "".join(str(page))

    return page
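# Minimal usage sketch for transform_wikipage (assumption: `html` is a
# DokuWiki-rendered page held as a string; the input and names below are
# illustrative only, not taken from the original code).
html = '<p>Intro ~CLEAR~ <a href="/wiki/doku.php?id=web:akce:2020">akce</a></p>'
section = transform_wikipage(html)
print(section.name)               # "section" - the rewritten <body> element
print(section.find('a')['href'])  # "/akce/2020" - DokuWiki link rewritten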
def transformColorizeCode(soup, cssClass, sourceLang):
    for elem in soup.select('pre.' + cssClass):
        input_str = elem.string
        if len(input_str) > 0 and input_str[0] == '\n':  # hack for leading empty line
            input_str = input_str[1:]
        colorized = sourceHighlight(input_str, sourceLang)
        origTitle = elem.get('title', None)

        # source-highlight generates <pre><tt>...</tt></pre>, get rid of <tt>
        new_elem = BeautifulSoup(colorized, 'lxml').tt  # XXX: parse just a fragment - how?
        new_elem.name = 'pre'
        new_elem['class'] = cssClass
        if origTitle is not None:
            # Preserve title (hover tool tip)
            new_elem['title'] = origTitle
        elem.replace_with(new_elem)
def parse_content(self):
    title = self.driver.find_element_by_css_selector('span#ai_cm_title').text

    body = BeautifulSoup(self.driver.page_source, 'html.parser')
    body = body.select('wrap_copy')[0]
    body.name = 'div'

    self.body_process(body)

    for div in body.select('div.comment_crop_href_mp4'):
        div.extract()

    for div in body.findAll('div', id=lambda x: x and x.startswith('show_')):
        div.extract()

    body = str(body)

    item = Item(title=title, text=body, created_at=datetime.now())
    print(f'created {item.title}')

    self.driver.back()

    return item
outfile = open(join(postpath, getpagename(pager)), "w+", encoding="utf-8")
body = BeautifulSoup(open(join(postpath, filename), "r", encoding="utf-8"), "html.parser").body

for image in body.find_all("img"):
    if checkfileattr(image, "media", "src"):
        if isfile(join(postpath, image["src"] + ".jpg")):
            image["src"] += ".jpg"
            image.parent["href"] += ".jpg"
        elif isfile(join(postpath, image["src"] + ".png")):
            image["src"] += ".png"
            image.parent["href"] += ".png"

body.name = "div"
sketch.body.append(body)

postscount += 1
if postscount == postsperpage:
    postscount = 0
    pager += 1

if postscount == 0:
    pager -= 1

if pager > 1:
    prevpage = sketch.new_tag("a", href=getpagename(pager - 1))
    prevpage.string = "Previous Page"
    sketch.body.append(prevpage)

outfile.write(sketch.prettify())
outfile.close()

print("The generated HTML file(s) are saved as " +
def make_top_level_tag_body(story: BeautifulSoup):
    story.name = "body"
    del story['class']
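# Minimal usage sketch for make_top_level_tag_body (assumption: the parsed
# document's top-level element is a <story> tag carrying a class attribute;
# the surrounding pipeline is not shown in the original).
from bs4 import BeautifulSoup

doc = BeautifulSoup('<story class="chapter"><p>Once upon a time...</p></story>', 'html.parser')
make_top_level_tag_body(doc.find('story'))
print(doc)  # -> <body><p>Once upon a time...</p></body>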
def create_transcript(url_transcript, url_image, page_nb, path_to_doc,
                      document_title, document_desc, document_lang):
    """Create xml file containing a page transcript from Transkribus, in PAGE standard.

    File is named after the corresponding page number.

    :param url_transcript: url to request transcript, provided by Transkribus.
    :param url_image: url to request image of current document/subcollection page.
    :param page_nb: current page number.
    :param path_to_doc: path to directory for the current document/subcollection.
    :param document_title: title of current document/subcollection.
    :param document_desc: description of current document/subcollection.
    :param document_lang: list of languages for current document/subcollection, separated by commas.
    :type url_transcript: string
    :type url_image: string
    :type page_nb: integer
    :type path_to_doc: string
    :type document_title: string
    :type document_desc: string
    :type document_lang: string
    :return: a status to signal possible server errors.
    :rtype: boolean
    """
    response = requests.request("GET", url_transcript)

    document_title = "<title>%s</title>" % document_title
    document_desc = "<desc>%s</desc>" % document_desc
    document_page_nb = "<pagenumber>%s</pagenumber>" % page_nb

    tag_title = BeautifulSoup(document_title, "xml")
    tag_desc = BeautifulSoup(document_desc, "xml")
    tag_page_nb = BeautifulSoup(document_page_nb, "xml")

    tag_title = tag_title.title.extract()
    tag_desc = tag_desc.desc.extract()
    tag_page_nb = tag_page_nb.pagenumber.extract()

    tag_title.name = "tu:title"
    tag_desc.name = "tu:desc"
    tag_page_nb.name = "tu:pagenumber"

    if len(document_lang) != 0:
        document_lang = ''.join([
            "<language>%s</language>" % l.strip()
            for l in document_lang.split(",")
        ])
        document_lang = "<languages>%s</languages>" % document_lang
        tag_languages = BeautifulSoup(document_lang, "xml")
        tag_languages = tag_languages.languages.extract()
        tag_lang_list = tag_languages.findAll("language")
        for tag in tag_lang_list:
            tag.name = "tu:language"

    if response.status_code == 503:
        error = True
    else:
        error = False

    xml = response.text
    path_to_file = os.path.join(path_to_doc, "%s.xml") % page_nb
    soup = BeautifulSoup(xml, "xml")

    # Adding namespace declaration for element added by Time Us project
    # Adding attributes to Page elements : @timeUs:url and @timeUs:id
    if soup.PcGts:
        soup.PcGts["xmlns:tu"] = "timeUs"
        soup.Page["tu:url"] = url_image
        soup.Page["tu:id"] = page_nb
        soup.Metadata.append(tag_title)
        soup.Metadata.append(tag_desc)
        soup.Metadata.append(tag_page_nb)
        if len(document_lang) != 0:
            for tag in tag_lang_list:
                soup.Metadata.append(tag)

    with open(path_to_file, "w") as f:
        f.write(str(soup))

    return error
# Download the corresponding image
path_to_img = os.path.join(path_to_doc_dir, "%s") % (page["imgFileName"])
response = requests.get(page_url_img)
with open(path_to_img, 'wb') as file:
    file.write(response.content)

t_title = "<title>%s</title>" % doc_title
tag_title = BeautifulSoup(t_title, "xml")
tag_title = tag_title.title.extract()
tag_title.name = "temp:title"

t_uploader = "<uploader>%s</uploader>" % doc_uploader
tag_uploader = BeautifulSoup(t_uploader, "xml")
tag_uploader = tag_uploader.uploader.extract()
tag_uploader.name = "temp:uploader"

t_desc = "<desc>%s</desc>" % doc_desc
tag_desc = BeautifulSoup(t_desc, "xml")
tag_desc = tag_desc.desc.extract()
tag_desc.name = "temp:desc"

t_nb = "<pagenumber>%s</pagenumber>" % page_nb
def func1():
    try:
        headers = {
            'Host': "www.mzitu.com",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
        }
        # proxy = {"http": "10.10.1.10:3128", "https": "10.10.1.10:1080"}
        html = requests.get(
            "http://www.mzitu.com/zipai/comment-page-1/#comments",
            headers=headers)
        # html = request.urlopen(r)
        print(html.text)

        html_doc = """
        <html><head><title>The Dormouse's story</title></head>
        <body>
        <p class="title"><b>The Dormouse's story</b></p>
        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.</p>
        <p class="story">...</p>
        """

        other_doc = str(html.text)
        print(type(other_doc))

        # Parse the response text, not the Response object itself
        soup = BeautifulSoup(html.text, "html.parser")
        print(soup.prettify())
        print(soup.title)
        print(soup.title.name)
        print(soup.p)
        print(soup.title.string)
        print(soup.find_all("a"))
        print(soup.find(id="link3"))

        for link in soup.find_all("a"):
            print(type(link))
            print(link["href"])

        print(type(soup.a["href"]))
        print(soup.get_text)
        print(soup.getText())

        tag = soup.find("a")
        tag.name = "bee"
        print(soup.getText())
        print(soup.get_text)
        print(type(soup.title.string))
        # print(unicode(soup.title.string))

        soup.name = "bee"
        print(soup.name)
    except ConnectionError as a:
        return
    except requests.HTTPError as b:
        return
    except TimeoutError as c:
        return
    except requests.TooManyRedirects as d:
        return
    except requests.exceptions.RequestException as e:
        print(e.args[0])
        return
    else:
        print("No exception")
    finally:
        print("Done")
        return
def popover(number, note):
    # we use BeautifulSoup to fix broken markup, e.g. incomplete span tags.
    note = BeautifulSoup(normalize_whitespace(note)).find('body')
    note.name = 'div'
    a = new_tag(
        soup, 'a', new_tag(soup, 'sup', number),
        **{
            'style': 'text-decoration: underline; cursor: pointer;',
            'class': 'popover-note',
            'data-original-title': 'Note %s' % number,
            'data-content': unicode(note),
        })
    return unicode(a)