def parse_begin_xxx(self, m, root):
    symbol = m.group(1)
    if symbol in ['html', 'HTML']:
        new_tag = BeautifulSoup(m.group(2), 'html.parser').contents[0]
    elif symbol in ['example', 'EXAMPLE']:
        new_tag = self.soup.new_tag('pre')
        new_tag['class'] = 'example'
        new_tag.string = m.group(2)
    elif symbol in ['quote', 'QUOTE']:
        new_tag = self.soup.new_tag('blockquote')
        # new_tag.string = m.group(2)
        for part in re.split('\n{2,}', m.group(2)):
            new_p_tag = self.soup.new_tag('p')
            new_p_tag.string = part
            new_tag.append(new_p_tag)
    elif symbol in ['verse', 'VERSE']:
        new_tag = self.soup.new_tag('p')
        new_tag['class'] = 'verse'
        new_tag.string = m.group(2)
    elif symbol in ['center', 'CENTER']:
        new_tag = self.soup.new_tag('div')
        new_tag['class'] = 'center'
        new_tag.string = m.group(2)
    else:
        raise RuntimeError('Not supported begin symbol: %s' % symbol)
    root.append(new_tag)

def generate_rss_item(self):
    item = BeautifulSoup(features="xml").new_tag("item")
    bare_tags = {
        "title": self.name,
        "itunes:duration": self.duration,
        "description": self.description,
        "itunes:subtitle": self.description,
        "itunes:summary": self.description,
    }
    for t, v in bare_tags.items():
        tag = BeautifulSoup(features="xml").new_tag(t)
        tag.string = v if v is not None else ""
        item.append(tag)

    guid = BeautifulSoup(features="xml").new_tag("guid", isPermaLink="false")
    guid.string = self.storage_key
    item.append(guid)

    url = f"{CDN_BASE_URL}/{self.storage_key}"
    item.append(
        BeautifulSoup(features="xml").new_tag(
            "enclosure", url=url, type="audio/mpeg"
        )
    )
    return item

def page_soupify(self, working_dir, html_filename):
    """
    Assumption: works only with a normal bootstrap-type file layout.
    That is, if index.html is located in /home/user/examples, then
    JavaScript and CSS are included only from directories such as
    /home/user/examples/js -- only one level deep.
    """
    if not working_dir.endswith('/'):
        working_dir += '/'
    file_fullpath = working_dir + html_filename
    self.index_soup = BeautifulSoup(open(file_fullpath))

    # NOTE(rushiagr): Assumes that all the <link> tags inside <head>
    # are for CSS files which lie locally!
    # Create a <style> tag for every <link> tag
    links = self.index_soup.head.find_all('link')
    for i in range(len(links)):
        link_media = links[i].get('media')
        style_tag = BeautifulSoup().new_tag(
            'style', media=link_media, type='text/css'
        )
        style_data = ''.join(
            line for line in open(working_dir + links[i].get('href')).readlines()
        )
        style_tag.string = style_data
        self.index_soup.head.append(style_tag)

    for i in range(len(links)):
        self.index_soup.head.link.decompose()

    # Create a <script> tag, which contains ALL the javascript embedded
    # in it, for every existing <script> tag. As you can see, the method
    # is slightly different from the one above.
    scripts = self.index_soup.head.find_all('script')
    script_filenames = []
    for script in scripts:
        script_filenames.append(script.get('src'))

    for i in range(len(scripts)):
        self.index_soup.head.script.decompose()

    for i in range(len(scripts)):
        script_tag = BeautifulSoup().new_tag('script')
        script_data = ''.join(
            line for line in open(working_dir + script_filenames[i]).readlines()
        )
        script_tag.string = script_data
        self.index_soup.head.append(script_tag)

    outfile = open(file_fullpath[:-5] + '_output.html', 'w')
    outfile.write(self.index_soup.prettify())
    outfile.close()

def process(filename):
    with open(filename) as f:
        soup = BeautifulSoup(f, "lxml")

    div = soup.find("div", class_="wy-side-nav-search")
    a = BeautifulSoup(
        """<a href="http://www.helsinki.fi" style="margin-bottom: 0px;"><img src="https://uni.materialbank.net/NiboWEB/uni/getPublicFile.do?uuid=146263&inline=false&ticket=8a2a112700dc87abd2813d55e149bc0c&type=original" style="margin-bottom: 0px;max-width: 60%;height: auto;width: auto;"></a>""",
        "html.parser")
    div.insert(0, a)

    divs = soup.find_all("div", class_="admonition")
    for d in divs:
        if len(d.contents) != 1:
            continue
        m = re.match(r"\n*(Exercise \d+ \([\w ]+\))", d.contents[0].string)
        if m:
            exercise = m[1]
            a = soup.new_tag("a", id=exercise.replace(" ", "-"))
            a.string = exercise
            d.string = ""
            d.append(a)
            # d.string = '<a name="%s">%s</a>' % (exercise.replace(" ", "-"), exercise)
            # print("\n", d)

    with open(filename, "w") as f:
        f.write(str(soup))

def _processing_attachment(self, matched):
    file_path = matched.group('post_path')
    download_url = 'http://{tistory_url}/attachment/{pre_path}@{post_path}'.format(
        tistory_url='{user_name}.tistory.com'.format(user_name=self.user_name),
        pre_path=matched.group('pre_path'),
        post_path=file_path)
    dir_path = self.dir_path
    download_path = '{dir_path}/{file_path}'.format(
        dir_path=dir_path,
        file_path=file_path
    )
    self._file_data_to_download.append((download_url, dir_path, file_path))
    if 'image/jpeg' in matched.group('attr'):
        return u'<img src="{site_url_tmpl}{download_path}" {attr}>'.format(
            site_url_tmpl='{{site.url}}/',
            download_path=download_path,
            attr=matched.group('attr'))
    else:
        tag = u'<a href="{site_url_tmpl}{download_path}" {attr}></a>'.format(
            site_url_tmpl='{{site.url}}/',
            download_path=download_path,
            attr=matched.group('attr'))
        soup = BeautifulSoup(tag).a
        soup.string = soup['filename']
        return unicode(soup)

def gen_nojs(sibling):
    nojs_link = BeautifulSoup().new_tag('a')
    nojs_link['href'] = '/window?location=' + sibling['href']
    nojs_link['style'] = 'display:block;width:100%;'
    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
    sibling.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
    sibling.append(nojs_link)

def rando():
    # clip off the last semicolon, then split up the separate queries
    reqs = request.query_string.decode('utf-8')[:-1].split(";")
    querry = []
    # make a list of queries to quinterest
    for r in reqs:
        querry.append(formatreq(r))
    if (len(querry) > 25):
        querry = querry[:25]

    questions = []
    for q in querry:
        out = get("http://quinterest.org{}".format(q)).text
        out = BeautifulSoup(out, 'html.parser').find_all(attrs={"class": "row"})
        out.pop(0)
        for e in out:
            # insert the query and replace button at the end of the question
            querystr = q[23:]
            querystr = sub('amount=[0-9]+', 'amount=1', querystr)
            span = BeautifulSoup(
                '<span class="subjTag" style="display:none"></span>').span
            repbutton = BeautifulSoup(
                '<button class="btn repbutton" onclick="replaceQuestion($(this))">Replace This Question</button>'
            ).button
            span.string = querystr
            e.div.append(span)
            e.div.append(repbutton)
            questions.append(str(e))

    questions = processQuestions(questions)
    return ("<br>".join(questions))

def find_all_p(self, segment):
    def skip_p(p):
        text_is_unicode_space = lambda x: len(x) <= 2 and (chr(194) in x or chr(160) in x)
        no_text = p.text == "" or p.text == "\n" or p.text.replace(" ", "") == "" or text_is_unicode_space(
            p.text.encode('utf-8'))
        return no_text and not p.find("img")

    ps = segment.find_all("p")
    new_ps = []
    temp_p = ""
    for p_n, p in enumerate(ps):
        if skip_p(p):
            continue
        elif len(p.text.split()) == 1 and re.compile(u"^.{1,2}[\)|\.]").match(p.text):
            # make sure it's in form 1. or ש.
            temp_p += p.text
        elif p.find("img"):
            img = p.find("img")
            if "pages/images/hard.gif" == img.attrs["src"]:
                temp_p += "*"
            elif "pages/images/harder.gif" == img.attrs["src"]:
                temp_p += "**"
        else:
            if temp_p:
                temp_tag = BeautifulSoup("<p></p>", "lxml")
                temp_tag = temp_tag.new_tag("p")
                temp_tag.string = temp_p
                temp_p = ""
                p.insert(0, temp_tag)
            new_ps.append(p)
    return new_ps

def encodeScript(line):
    sc = BeautifulSoup(line, "html.parser").find("script")
    if (sc.get("src")):
        sc["src"] = encodeBase64(sc.get("src"))
    else:
        sc.string = pattern.sub(
            lambda x: repr(encodeBase64(x.group(2), dirname)), sc.string)
    return sc.prettify()

def generate_rss_channel(self):
    channel = BeautifulSoup(features="xml").new_tag("channel")
    bare_tags = {
        "title": self.name,
        "description": self.description,
        "language": "en-us",
        "docs": "http://www.rssboard.org/rss-specification",
        "generator": "myself",
        "lastBuildDate": datetime.now().ctime(),
    }
    for t, v in bare_tags.items():
        tag = BeautifulSoup(features="xml").new_tag(t)
        tag.string = v
        channel.append(tag)

    # Links
    lt = BeautifulSoup(features="xml").new_tag("link")
    lt.string = self.url
    channel.append(lt)
    lta = BeautifulSoup(features="xml").new_tag(
        "atom:link", href=self.url, rel="self"
    )
    channel.append(lta)

    # iTunes category and friends
    cat = BeautifulSoup(features="xml").new_tag(
        "itunes:category", text="Technology"
    )
    cat.append(
        BeautifulSoup(features="xml").new_tag("itunes:category", text="Podcasting")
    )
    channel.append(cat)
    channel.append(
        BeautifulSoup(features="xml").new_tag(
            "itunes:image",
            href="https://timbrook-podcast.sfo2.digitaloceanspaces.com/podcover.png",
        )
    )
    expl = BeautifulSoup(features="xml").new_tag("itunes:explicit")
    expl.string = "yes"
    channel.append(expl)
    return channel

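# A minimal sketch of how the two generators above (generate_rss_channel and
# generate_rss_item) might be stitched into a complete feed document. The
# wrapping <rss> element, its namespace declarations, and the podcast/episodes
# objects are illustrative assumptions, not part of the original snippets.
from bs4 import BeautifulSoup


def generate_rss_feed(podcast, episodes):
    doc = BeautifulSoup(features="xml")
    rss = doc.new_tag("rss", version="2.0")
    # the itunes:* and atom:link tags used above need these namespaces declared
    rss["xmlns:itunes"] = "http://www.itunes.com/dtds/podcast-1.0.dtd"
    rss["xmlns:atom"] = "http://www.w3.org/2005/Atom"
    channel = podcast.generate_rss_channel()
    for episode in episodes:
        channel.append(episode.generate_rss_item())
    rss.append(channel)
    doc.append(rss)
    return str(doc)
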
def format(content):
    bs = BeautifulSoup(content, "html.parser")
    if bs.div is None:
        for _img in bs.find_all("img"):
            tex = BeautifulSoup("", "html.parser").new_tag("tex")
            tex.string = "\\" + _img["latex"]
            _img.replace_with(tex)
    strs = str(bs)
    return strs

def get_trans_text():
    url = 'https://translate.google.cn/#view=home&op=translate&sl=zh-CN&tl=en&text=%3Cdiv%20class%3D%22dpl-box-title%22%3E%0A%20%20%20%20%20%20%20%20%20%20%20%20%E8%B4%A7%E5%93%81%E7%B1%BB%E5%9E%8B%0A%20%20%20%20%20%20%20%20%3C%2Fdiv%3E'
    # req = request.urlopen(url)
    wd = webdriver.Chrome(executable_path=os.path.join(
        os.path.dirname(__file__), 'library/chromedriver.exe'))
    wd.get(url)
    time.sleep(10)
    html_text = wd.page_source
    wd.quit()
    print(html_text)
    soup = BeautifulSoup(html_text, features="html.parser")
    print(soup.string)  # .string is a property, not a callable

def _merge_consecutive_symbols(consecutive_char_sequence: List[BeautifulSoup],
                               consecutive_char_indices: List[int],
                               base_tag: BeautifulSoup) -> NodeSymbol:
    base_tag['s2:start'] = consecutive_char_sequence[0]['s2:start']
    base_tag['s2:end'] = consecutive_char_sequence[-1]['s2:end']
    base_tag['s2:index'] = consecutive_char_sequence[0]['s2:index']
    base_tag.string = ''.join(
        list(map(lambda node: node.string, consecutive_char_sequence)))
    node_clone = _clean_node_of_annotations(base_tag)
    return NodeSymbol(characters=consecutive_char_indices,
                      mathml=str(node_clone),
                      node=base_tag)

def get_dc_row(element, qualifier, value):
    """
    Parameters:
        element - xml element
        qualifier - xml qualifier
        value - value to be written in the xml file for the specific
                element and qualifier

    Returns:
        The newly created xml row built from the supplied information, e.g.
        <dcvalue element="date" qualifier="issued">2018-04</dcvalue>
    """
    row = BeautifulSoup("<dcvalue></dcvalue>", "xml").dcvalue
    row['element'] = element
    row['qualifier'] = qualifier
    row.string = value
    return row

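# A hypothetical usage sketch for get_dc_row(): building a DSpace-style
# dublin_core.xml document out of several rows. The <dublin_core> root element
# and the sample values are illustrative assumptions.
from bs4 import BeautifulSoup

doc = BeautifulSoup("<dublin_core></dublin_core>", "xml")
doc.dublin_core.append(get_dc_row("date", "issued", "2018-04"))
doc.dublin_core.append(get_dc_row("title", "none", "Example item"))
print(doc.prettify())
# <?xml version="1.0" encoding="utf-8"?>
# <dublin_core>
#  <dcvalue element="date" qualifier="issued">2018-04</dcvalue>
#  <dcvalue element="title" qualifier="none">Example item</dcvalue>
# </dublin_core>
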
def append_nojs(result: BeautifulSoup) -> None:
    """Appends a no-Javascript alternative for a search result

    Args:
        result: The search result to append a no-JS link to

    Returns:
        None
    """
    nojs_link = BeautifulSoup(features='html.parser').new_tag('a')
    nojs_link['href'] = f'/{Endpoint.window}?location=' + result['href']
    nojs_link.string = ' NoJS Link'
    result.append(nojs_link)

def get_header_to_link(html):
    for title in html.find_all('h3'):
        # Add a link to search on how to do the achievement
        link = BeautifulSoup().new_tag(
            "a",
            href=f'http://www.google.com/search?q=halo+{title.string}+achievement')
        # Open in new tab on click
        link["target"] = "_blank"
        link.string = f'{title.string}'
        title.string.replace_with(link)
    return str(html)

def append_nojs(result: BeautifulSoup) -> None:
    """Appends a no-Javascript alternative for a search result

    Args:
        result: The search result to append a no-JS link to

    Returns:
        None
    """
    nojs_link = BeautifulSoup(features='html.parser').new_tag('a')
    nojs_link['href'] = '/window?location=' + result['href']
    nojs_link['style'] = 'display:block;width:100%;'
    nojs_link.string = 'NoJS Link: ' + nojs_link['href']
    result.append(BeautifulSoup('<br><hr><br>', 'html.parser'))
    result.append(nojs_link)

def getitem(speech, command, data):
    parse_count = 0
    no_check = 0
    tag = None

    # default tags
    for item in tag_mapper:
        if item in speech:
            parse_count = speech.find(item)
            tag = data.new_tag(tag_mapper[item])
            break
    else:
        # snippets
        items = os.listdir(cache_dir)
        for item in items:
            tmp = item.split(".")[0]
            if tmp in speech:
                parse_count = speech.find(tmp)
                with open(cache_dir + item, "r") as f:
                    tag = BeautifulSoup(f.read(), features="html.parser")
                no_check = 1
                break

    speech = speech[parse_count:].split()
    try:
        if not no_check:
            check_content = speech.index("content")
            tag.string = speech[check_content + 1]
        check_class = speech.index("class")
        class_ = speech[check_class + 2]
        tag["class"] = class_
    except ValueError:
        pass

    if data and tag:
        data.body.append(tag)
        savefile(data)
        speak("Successfully added to the html")
    else:
        print(data, "#############", tag)
        speak("Invalid command")
    return data

def get_article(url):
    result = {"url": url}
    if (not validURL(url)):
        result.update({"error": "url is not valid"})
        return result

    html = urlopen(url)
    soup = cleanHTML(BS(html), url)

    # put into <article>
    atl = BS("<article></article>").article
    # get title and append to h1
    head = BS("<h1></h1>").h1
    head.string = soup.title.string
    result.update({"title": soup.title.string})

    atlist = []
    for p in soup.findAll("p"):
        tag = p.parent
        if (tag not in atlist):
            atlist.append(tag)
    # atlist = [p.parent for p in soup.findAll("p")]

    scored = {}
    for tag in atlist:
        scored.update({tag: get_score(tag)})

    # get the highest score
    final_score = 0
    content = ""
    for t, s in scored.items():
        if (s >= final_score):
            final_score = s
            content = t
    if (final_score == 0):
        # all score < 0
        result.update({"error": "nothing valuable was found"})
        return result

    # # remove div in content
    # for div in content.findAll("div"):
    #     divs = get_score(div)
    #     if (divs < 20):
    #         div.extract()

    # if article found, return article
    if (content.name == "article"):
        atl = content
    else:
        # if no h1 found, put the title as h1
        if (content.find("h1") is None):
            # if previous_sibling is head, append
            pre1 = content.findPreviousSibling("h1")
            pre2 = content.findPreviousSibling("h2")
            if (pre1):
                atl.append(pre1)
            elif (pre2):
                atl.append(pre2)
            else:
                atl.append(head)
        atl.append(content)

    atl = unicode(atl)
    result.update({"article": atl})
    result.update({"score": final_score})
    return result

def transformMissions(self):
    links1 = self.__getAllMissions(self.__res1)
    links2 = self.__getAllMissions(self.__res2)
    links = links1.copy()
    links.update(links2)
    # print(links)
    count = 0
    transformedMissions = {}
    transformedMissions["timestamp"] = {
        "date": datetime.now().strftime("%d-%m-%y"),
        "time": datetime.now().strftime("%H-%M-%S")
    }
    for name, link in links.items():
        count += 1
        try:
            # print("---------------------------" + link + "---------------------------")
            time.sleep(0.5)
            mission = requests.get(
                'https://escapefromtarkov.gamepedia.com' + link)
            mission = BeautifulSoup(mission.text, "html.parser")
        except requests.exceptions.SSLError as e:
            print("Error reading mission")
            # mission = "<h1> Error </h1>"

        name = mission.select_one("h1").text
        if name == "Quests" or name == "Quests/zh":
            continue
        transformedMissions[name] = ({"favorite": 0})

        infoxbox = []
        for temp in mission.select(".va-infobox-content"):
            if "previous:" not in temp.text and "leads to:" not in temp.text:
                infoxbox.append(temp.text)
        transformedMissions[name].update({"infobox": infoxbox})

        liste = []
        for headlines in mission.select("h2 span"):
            # print(headlines.next_sibling)
            # print(headlines.attrs)
            if "class" not in headlines.attrs:
                continue
            if headlines.attrs["class"][0] == "mw-headline":
                temp = headlines.parent
                for tag in temp.next_siblings:
                    if tag.name == "ul" or "table":
                        # print(tag.name)
                        if (tag.name == "h2") or (tag.name == "table" and "class" in tag.attrs and tag.attrs["class"][-1] == "va-navbox-bottom"):
                            transformedMissions[name].update(
                                {headlines.text: liste})
                            liste = []
                            break
                        if hasattr(tag, "text"):
                            liste.append(tag.text.strip())
                        if tag.name == "table" and "class" in tag.attrs or tag.name == "p" or tag.name == "li":
                            # print(tag.findAll("img"))
                            for image in tag.findAll("img"):
                                # print(image.attrs["src"])
                                liste.append(image.attrs["src"])

        completeSite = mission.find("div", {"id": "bodyContent"})
        # print(completeSite.findAll("img"))
        for editSpan in completeSite.select("span[class='mw-editsection']"):
            editSpan.extract()
        for questList in completeSite.select("table[class='va-navbox-border va-navbox-bottom']"):
            questList.extract()
        for questHeader in completeSite.select("div[class='catlinks']"):
            questHeader.extract()
        for infoxbox in completeSite.select("table[class='va-infobox']"):
            infoxbox.extract()
        for jumper in completeSite.select("div[class='mw-jump']"):
            jumper.extract()
        for hidden in completeSite.select("div[class='noprint']"):
            hidden.extract()
        for image in completeSite.select("a[class='image']"):
            image.attrs["href"] = image.contents[0].attrs["src"]
            # print(image.attrs["href"])
        for aLink in completeSite.select("a"):
            try:
                if "class" in aLink.attrs.keys():
                    for attr in aLink.attrs["class"]:
                        if attr == "image":
                            # print(aLink.contents[0])
                            pass
                        else:
                            newTag = BeautifulSoup(
                                features="html.parser").new_tag("b")
                            newTag.string = aLink.text
                            aLink.replace_with(newTag)
                else:
                    newTag = BeautifulSoup(
                        features="html.parser").new_tag("b")
                    newTag.string = aLink.text
                    aLink.replace_with(newTag)
            except Exception as e:
                print(e)
        # print(completeSite)
        transformedMissions[name].update(
            {"completeSite": str(completeSite)})

        if (count % 10) == 0:
            print(count)
        # print(mission)
        # if count == 8:
        #     break

    print(str(count) + " missions loaded")
    return transformedMissions

def handle_text(filename, img_keyword, sound_keyword, video_keyword):
    """
    :param filename: file name of a chapter, such as 'Charpter1.txt',
        without directory path.
    :result: an html file
    """
    # open file and read paragraphs
    with open(os.path.join('./text/', filename), 'r+') as f:
        paras = [p.strip() for p in f.readlines() if len(p) > 4]

    # read html template
    with open(r'base.txt', 'r+') as f:
        template_text = f.read()
    temp = BeautifulSoup(template_text, "lxml")

    # replace cover img
    # cover = temp.find('img', {'id': 'cover'})
    # cover['src'] = './pics/cover.jpg'

    # handle title
    title = temp.find('h3')
    title.string = paras[0]
    temp.title.string = paras[0]

    # handle paras
    text_box = temp.find('div', {'id': 'text'})
    js_box = temp.find('script', {'id': 'main'})
    count = [0, 0]
    img_pat = re.compile(r'\((\W+?)\)\[' + img_keyword + r'(\S+?)\]')
    sound_pat = re.compile(r'\((\W+?)\)\[' + sound_keyword + r'(\S+?)\]')
    video_pat = re.compile(r'\((\W+?)\)\[' + video_keyword + r'(\S+?)\]')
    for i in range(1, len(paras)):
        new_p = temp.new_tag('p')
        new_br = temp.new_tag('br')
        # handle img in text
        if img_pat.findall(paras[i]):
            imgs = img_pat.findall(paras[i])  # a list of tuple(text, img_id)
            for img in imgs:
                img_result = insert_img(img[1], temp, count)
                new_img_div, count = img_result[0], img_result[1]
                text_box.append(new_img_div)
            new_p.string = re.sub(img_pat, r'\1', paras[i])  # delete () and []
            # text_box.append(new_p)
            # text_box.append(new_br)
        if sound_pat.findall(paras[i]):
            sounds = sound_pat.findall(paras[i])
            new_p.string = re.sub(sound_pat, r'\1', paras[i])
            for sound in sounds:
                new_play_logo = insert_sound(sound[0], sound[1], paras[i], temp)
                new_p.append(new_play_logo)
            # text_box.append(new_p)
            # text_box.append(new_br)
        if video_pat.findall(paras[i]):
            videos = video_pat.findall(paras[i])
            for video in videos:
                new_video_link = temp.new_string(
                    "<a target='_blank' href='" + insert_video(video[1], paras[i], temp)
                    + ".html'>" + video[0] + "</a>")
                new_p.string = re.sub(video_pat, new_video_link, new_p.string)
            new_p = BeautifulSoup(html_parser.unescape(str(new_p)), 'lxml')
        if not (img_pat.findall(paras[i]) or sound_pat.findall(paras[i]) or video_pat.findall(paras[i])):
            new_p.string = paras[i]
        text_box.append(new_p)
        text_box.append(new_br)

    with open('audio.txt', 'r+') as f:
        text = f.read()
    audio_tag = BeautifulSoup(text, 'lxml').div
    text_box.append(audio_tag)

    # add js about sound to html script
    # with open('static/js/audio.js', 'r+') as f:
    #     audio_js = f.read()
    # js_box.append(audio_js)

    with open(filename[:-4] + '.html', 'w+') as f:
        f.write(temp.prettify("utf-8"))
    print '==========finish ' + filename + '=========='

def collapse_sections(self) -> None:
    """Collapses long result sections ("people also asked", "related
    searches", etc) into "details" elements

    These sections are typically the only sections in the results page
    that have more than ~5 child divs within a primary result div.

    Returns:
        None (The soup object is modified directly)
    """
    minimal_mode = read_config_bool('WHOOGLE_MINIMAL')

    def pull_child_divs(result_div: BeautifulSoup):
        try:
            return result_div.findChildren(
                'div', recursive=False)[0].findChildren('div', recursive=False)
        except IndexError:
            return []

    if not self.main_divs:
        return

    # Loop through results and check for the number of child divs in each
    for result in self.main_divs:
        result_children = pull_child_divs(result)
        if minimal_mode:
            if len(result_children) in (1, 3):
                continue
        else:
            if len(result_children) < self.RESULT_CHILD_LIMIT:
                continue

        # Find and decompose the first element with an inner HTML text val.
        # This typically extracts the title of the section (i.e. "Related
        # Searches", "People also ask", etc)
        label = 'Collapsed Results'
        for elem in result_children:
            if elem.text:
                label = elem.text
                elem.decompose()
                break

        # Create the new details element to wrap around the result's
        # first parent
        parent = None
        idx = 0
        while not parent and idx < len(result_children):
            parent = result_children[idx].parent
            idx += 1

        details = BeautifulSoup(features='html.parser').new_tag('details')
        summary = BeautifulSoup(features='html.parser').new_tag('summary')
        summary.string = label
        details.append(summary)

        if parent and not minimal_mode:
            parent.wrap(details)
        elif parent and minimal_mode:
            # Remove parent element from document if "minimal mode" is
            # enabled
            parent.decompose()

def update_link(self, link: Tag) -> None:
    """Update internal link paths with encrypted path, otherwise remove
    unnecessary redirects and/or marketing params from the url

    Args:
        link: A bs4 Tag element to inspect and update

    Returns:
        None (the tag is updated directly)
    """
    # Replace href with only the intended destination (no "utm" type tags)
    href = link['href'].replace('https://www.google.com', '')
    if 'advanced_search' in href or 'tbm=shop' in href:
        # FIXME: The "Shopping" tab requires further filtering (see #136)
        # Temporarily removing all links to that tab for now.
        link.decompose()
        return

    result_link = urlparse.urlparse(href)
    q = extract_q(result_link.query, href)

    if q.startswith('/'):
        # Internal google links (i.e. mail, maps, etc) should still
        # be forwarded to Google
        link['href'] = 'https://google.com' + q
    elif '/search?q=' in href:
        # "li:1" implies the query should be interpreted verbatim,
        # which is accomplished by wrapping the query in double quotes
        if 'li:1' in href:
            q = '"' + q + '"'
        new_search = 'search?q=' + self.encrypt_path(q)

        query_params = parse_qs(urlparse.urlparse(href).query)
        for param in VALID_PARAMS:
            if param not in query_params:
                continue
            param_val = query_params[param][0]
            new_search += '&' + param + '=' + param_val
        link['href'] = new_search
    elif 'url?q=' in href:
        # Strip unneeded arguments
        link['href'] = filter_link_args(q)

        # Add no-js option
        if self.config.nojs:
            append_nojs(link)

        if self.config.new_tab:
            link['target'] = '_blank'
    else:
        if href.startswith(MAPS_URL):
            # Maps links don't work if a site filter is applied
            link['href'] = MAPS_URL + "?q=" + clean_query(q)
        else:
            link['href'] = href

    # Replace link location if "alts" config is enabled
    if self.config.alts:
        # Search and replace all link descriptions
        # with alternative location
        link['href'] = get_site_alt(link['href'])
        link_desc = link.find_all(
            text=re.compile('|'.join(SITE_ALTS.keys())))
        if len(link_desc) == 0:
            return

        # Replace link description
        link_desc = link_desc[0]
        for site, alt in SITE_ALTS.items():
            if site not in link_desc:
                continue
            new_desc = BeautifulSoup(features='html.parser').new_tag('div')
            new_desc.string = str(link_desc).replace(site, alt)
            link_desc.replace_with(new_desc)
            break

post_files = [f for f in listdir(POST_DIR) if isfile(join(POST_DIR, f))]

posts = []
for post_file in post_files:
    # factor the post data
    with open((POST_DIR + post_file), 'r') as f:
        post_file_data = f.read()

    # construct the HTML tree
    post_soup = Soup(post_file_data, features="html.parser")

    # find the <title> tag
    title_soup = post_soup.find("title").extract()
    title = title_soup.string

    post_meta_soup = post_soup.find(id="post-meta")
    title_in_post_soup = Soup(features="html.parser").new_tag("h2")
    title_in_post_soup.string = title
    post_meta_soup.append(title_in_post_soup)

    date_in_post_soup = Soup(features="html.parser").new_tag("p")
    date_in_post_soup["class"] = "small-gray"
    date_in_post_soup.string = "Published on " + date_from(post_file)
    post_meta_soup.append(date_in_post_soup)

    # find all the <latex> tags
    latexes = post_soup.find_all('latex')
    for latex in latexes:
        # convert the latex to html
        latex_html = delatex(latex.string)
        latex.replace_with(Soup(latex_html, features="html.parser"))

    # create the post html and write it to file
    # insert the post soup into the template soup

def createElement(element, classID, string):
    new_tag = BeautifulSoup('<' + element + '></' + element + '>', 'lxml')
    new_tag = new_tag.find(element)
    new_tag['class'] = classID
    new_tag.string = string
    return new_tag

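# A small usage sketch for createElement(); the tag name, class, and text
# below are arbitrary illustrative values, not from the original code.
title = createElement('h2', 'post-title', 'Hello, world')
print(title)
# <h2 class="post-title">Hello, world</h2>
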
# -*- coding: utf-8 -*-
# A string is often contained inside a tag. Beautiful Soup uses the
# NavigableString class to wrap the string inside a tag:
from bs4 import BeautifulSoup

tag = BeautifulSoup("<b class='clas1'>李伟</b>")
print tag.string        # 李伟
print type(tag.string)  # <class 'bs4.element.NavigableString'>

# A NavigableString behaves like a Python Unicode string; calling unicode()
# converts the NavigableString object directly into a Unicode string.
unicode_string = unicode(tag.string)
print unicode_string        # 李伟
print type(unicode_string)  # <type 'unicode'>

# The string inside a tag cannot be edited in place, but it can be replaced
# with another string, either by assignment or with replace_with():
tag.string = 'lijie'
print tag.string
tag.string.replace_with('lijiebao')
print tag.string

# Note: a NavigableString supports most, but not all, of the attributes defined
# for navigating and searching the tree.
# In particular, a string cannot contain anything else (whereas a tag can
# contain a string or another tag), so strings do not support the .contents or
# .string attributes, or the find() method.
# To use a NavigableString outside of Beautiful Soup, call unicode() on it to
# turn it into an ordinary Unicode string; otherwise the object keeps a
# reference to the entire parse tree even after Beautiful Soup has finished
# with it, which wastes memory.

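# For comparison, a rough Python 3 equivalent of the walkthrough above:
# unicode() is gone, str() is used instead, and recent Beautiful Soup versions
# expect an explicit parser name.
from bs4 import BeautifulSoup

tag = BeautifulSoup("<b class='clas1'>李伟</b>", "html.parser").b
print(tag.string)        # 李伟
print(type(tag.string))  # <class 'bs4.element.NavigableString'>
plain = str(tag.string)  # detach the text from the parse tree
print(type(plain))       # <class 'str'>
tag.string.replace_with('lijiebao')
print(tag.string)        # lijiebao
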
def inspect_file(file_name):
    raw_file = source_path + file_name
    raw_soup = BeautifulSoup(open(raw_file), from_encoding="utf-8")
    all_prons = raw_soup.find_all('span', class_="pron")
    if len(all_prons) > 0:
        for pron in all_prons:
            all_labels = pron.find_all('span', class_='lbl')
            dialects = ""
            register = ""
            if len(all_labels) > 0:
                dialects_tags = pron.find_all('span', class_='geo')
                register_tags = pron.find_all('span', class_='register')
                if dialects_tags is not None and dialects_tags != []:
                    d_array = []
                    for d_tag in dialects_tags:
                        d_array.append(str(d_tag.string))
                        d_tag.decompose()
                    dialects = ", ".join(sorted(d_array))
                if register_tags is not None and register_tags != []:
                    register = str(register_tags[0].string)
                    for r_tag in register_tags:
                        r_tag.decompose()

            mod_tags = pron.find_all('span', class_='mod')
            for m_tag in mod_tags:
                m_tag.decompose()

            ipa_tag = BeautifulSoup().new_tag("div", **{'class': 'ipa'})

            # Find image tag
            img = pron.find('img')
            if img is not None:
                # Extract MP3 link
                attr_text = img['onclick']
                # Extract link text with RegEx
                m = re.search(r"[^\/]*\.mp3", attr_text)
                link = m.group(0)
                # Extract target word
                attr_word = img['alt']
                target = attr_word.replace('Pronunciation for ', '')
                # Extract IPA of pron, strip brackets etc.
                raw_ipa = "".join(pron.find_all(text=True))
                raw_ipa = re.sub(r'\(|\)|;|\n', '', raw_ipa)
                raw_ipa = raw_ipa.strip()
                raw_ipa = re.sub(r'\,$', ' ', raw_ipa)
                ipa = raw_ipa.strip()
                ipa_tag.string = ipa
                ipa_tag['data-audio'] = link
                if target != "":
                    ipa_tag['data-orth'] = target
                if dialects != "":
                    ipa_tag['data-dialects'] = dialects
                if register != "":
                    ipa_tag['data-register'] = register
                # Replace pron with IPA tag
            else:
                ipa_str = str(pron.string)
                ipa_str = re.sub(r'\(|\)|;', '', ipa_str)
                ipa_tag.string = ipa_str.strip()
            pron.replace_with(ipa_tag)

        f_output = open(raw_file, 'w')
        f_output.write(str(raw_soup))
        f_output.close()
    return
