def run(self, text):
    soup = BeautifulSoup(text, 'html.parser')
    new_soup = BeautifulSoup()
    content = new_soup.new_tag('div', **{'class': self.content_class})
    for tag in soup.children:
        if isinstance(tag, NavigableString):
            continue
        if tag.name not in self.incut_tags and len(tag.contents) == 1 and tag.contents[0].name in self.incut_tags:
            tag = tag.contents[0]
        if tag.name in self.incut_tags:
            if len(content):
                new_soup.append(content)
                content = new_soup.new_tag('div', **{'class': self.content_class})
            klass = self.incut_class
            if tag.name == 'iframe':
                klass += ' ' + self.incut_video_class
            incut = soup.new_tag('div', **{'class': klass})
            incut.append(tag)
            new_soup.append(incut)
        else:
            content.append(tag)
    if len(content):
        new_soup.append(content)
    return new_soup.decode()
def render_to_response(self, context, **kwargs):
    response = super(ArticleDetail, self).render_to_response(context, **kwargs)
    if self.request.user.is_staff or self.request.GET.get("preview"):
        return response
    cache = caches['default']
    if cache.get(context['object'].get_absolute_url()):
        return cache.get(context['object'].get_absolute_url())
    content = response.rendered_content
    bs = BeautifulSoup(content, "html5lib")
    imgs = bs.find("div", class_="article-content").find_all("img")
    for img in imgs:
        if not img.attrs:
            continue
        ns_attrs = img.attrs
        ns_img = bs.new_tag("img", **ns_attrs)
        img.insert_before(ns_img)
        ns_img.wrap(bs.new_tag("noscript"))
        if img.attrs.get("class") and "lazyload" in img.attrs["class"]:
            continue
        img.attrs["class"] = img.attrs.get("class", []) + ["lazyload"]
        if img.attrs.get("src"):
            img.attrs["data-src"] = img.attrs.get("src")
        if img.attrs.get("srcset"):
            img.attrs["data-srcset"] = img.attrs.get("srcset")
            img.attrs.pop("srcset", "")
        gray_gif = "data:image/gif;base64,R0lGODlhAQABAIAAAMLCwgAAACH5BAAAAAAALAAAAAABAAEAAAICRAEAOw=="
        img.attrs["src"] = gray_gif
    content = unicode(bs)
    response.content = content
    cache.set(context['object'].get_absolute_url(), response)
    return response
def handle_html_content(self, content):
    soup = BeautifulSoup(content, 'html.parser')
    for p_elem in soup.find_all('p'):
        css = None
        if 'style' in p_elem.attrs:
            css = cssutils.parseStyle(p_elem.attrs['style'])
        text_list = p_elem.text.split()
        p_new = soup.new_tag('p', style=css.cssText if css else None)
        for idx, word in enumerate(text_list):
            if len(self.dorks) <= 0:
                self.dorks = yield from self.get_dorks()
            word += ' '
            if idx % 5 == 0:
                a_tag = soup.new_tag(
                    'a',
                    href=self.dorks.pop(),
                    style='color:{color};text-decoration:none;cursor:text;'.format(
                        color=css.color if css and 'color' in css.keys() else '#000000'
                    )
                )
                a_tag.string = word
                p_new.append(a_tag)
            else:
                p_new.append(soup.new_string(word))
        p_elem.replace_with(p_new)
    content = soup.encode('utf-8')
    return content
def test_feed():
    data = urllib2.urlopen(MYFEED).read()
    tree = BeautifulSoup(data, features='xml')
    items = tree.find_all('entry')
    for i in items:
        # find the ID
        video = i.find('link', rel='related')['href'].split('/')[-1]
        # remove extraneous gunk
        [q.decompose() for q in i.group.find_all('content')]
        # add or update the enclosure
        enc = tree.new_tag('link')
        enc['rel'] = 'enclosure'
        enc['type'] = 'audio/mpeg'
        enc['title'] = 'mp3'
        enc['href'] = 'http://%s:%s/media/%s' % (HOSTNAME, PORT, video)
        i.append(enc)
        # add the description
        desc = tree.new_tag('description')
        desc.string = i.description.string
        i.append(desc)
    return tree.prettify()
def create_new_feed():
    config = yaml.load(open('config.yaml'))
    if not os.path.isfile(config['feed_location']):
        # Create initial feed info
        soup = BeautifulSoup(
            """<feed xmlns="http://www.w3.org/2005/Atom"></feed>""")
        feed_tag = soup.feed
        # Create required tags and add them in the feed tag
        title_tag = soup.new_tag('title')
        id_tag = soup.new_tag('id')
        updated_tag = soup.new_tag('updated')
        feed_tag.append(title_tag)
        feed_tag.append(id_tag)
        feed_tag.append(updated_tag)
        # Put data into the new feeds
        title_tag.string = config['feed_title']
        id_tag.string = config['feed_id']
        updated_tag.string = datetime.datetime.now().isoformat('T')
        feed_file = open(config['feed_location'], 'w')
        print(config['feed_xml'], file=feed_file)
        print(feed_tag.prettify(), file=feed_file)
        feed_file.close()
def serializeLabContent(labContent):
    # print labContent
    f = open('template.html', "r")
    labHtml = f.read()
    f.close()
    labTemplate = BeautifulSoup(labHtml)
    articleSection = labTemplate.find_all('div', id="experiment-article-sections")[0]
    sectionNumber = 1
    for sectionName, sectionContent in labContent:
        sectionTag = labTemplate.new_tag('section', id="experiment-article-section-" + str(sectionNumber))
        articleSection.append(sectionTag)
        iconTag = labTemplate.new_tag('div', id="experiment-article-section-" + str(sectionNumber) + "-icon")
        iconTag['class'] = 'icon'
        sectionTag.append(iconTag)
        headingTag = labTemplate.new_tag('div', id="experiment-article-section-" + str(sectionNumber) + "-heading")
        headingTag['class'] = 'heading'
        headingTag.append(sectionName)
        sectionTag.append(headingTag)
        contentTag = labTemplate.new_tag('div', id="experiment-article-section-" + str(sectionNumber) + "-content")
        contentTag['class'] = 'content'
        contentTag.append(sectionContent)
        sectionTag.append(contentTag)
        sectionNumber += 1
    f = open('content.html', "w+")
    labTemplate = labTemplate.prettify()
    f.write(labTemplate.encode('utf-8'))
    f.close()
def split_and_save(sentence, listOfWords, new):
    while len(sentence) > 0:
        res = find_word(sentence, listOfWords)
        word = new.new_tag("w")
        if res[0] == True:
            for i in res[1]:
                an = new.new_tag("ana")
                an['lex'] = i.lex
                an['transcr'] = i.transcr
                an['sem'] = i.sem
                word.append(an)
            if res[2] < len(sentence) and res[2] > 0:
                sentence = sentence[res[2]:]
            elif res[2] == 0:
                an = new.new_tag("ana")
                an['lex'] = sentence[0]
                word.append(an)
                sentence = sentence[1:]
            else:
                sentence = ''
            new.append(word)
        else:
            if 1 < len(sentence):
                an = new.new_tag("ana")
                an['lex'] = sentence[0]
                word.append(an)
                sentence = sentence[1:]
            else:
                sentence = ''
def write_counters():
    try:
        if os.path.exists(common.getConfig("rootDir") + "/report/report.html"):
            pre_rendered = open(common.getConfig("rootDir") + "/report/report.html", 'r').read()
            pre_rendered_html = BeautifulSoup(pre_rendered, 'html5lib')
            warnings = len(re.findall(r'badger-warning', str(pre_rendered_html)))
            information = len(re.findall(r'badger-success', str(pre_rendered_html)))
            vulnerabilities = len(re.findall(r'badger-danger', str(pre_rendered_html)))
            debug = len(re.findall(r'debug-level', str(pre_rendered_html)))
            new_div_tag = pre_rendered_html.new_tag("div")
            new_div_tag.string = str(vulnerabilities)
            pre_rendered_html.find("h1", id="vulnerability_count").append(new_div_tag)
            new_div_tag1 = pre_rendered_html.new_tag("div")
            new_div_tag1.string = str(warnings)
            pre_rendered_html.find("h1", id="warning_count").append(new_div_tag1)
            new_div_tag2 = pre_rendered_html.new_tag("div")
            new_div_tag2.string = str(information)
            pre_rendered_html.find("h1", id="information_count").append(new_div_tag2)
            new_div_tag3 = pre_rendered_html.new_tag("div")
            new_div_tag3.string = str(debug)
            pre_rendered_html.find("h1", id="debug_count").append(new_div_tag3)
            with open(common.getConfig("rootDir") + "/report/report.html", "w") as fh:
                fh.write(str(pre_rendered_html.prettify()))
                fh.close()
    except Exception as e:
        common.logger.debug("Error in write_counters: " + str(e))
def add_inflections():
    """Add the list of inflected forms."""
    Fobj = open(file_name_value, "r", encoding="utf-8")
    data = Fobj.read()
    Fobj.close()
    soup = BeautifulSoup(data)
    entrylist = soup.findAll("idx:entry")
    for entry in entrylist:
        # The current headword
        word = entry.find("idx:orth")["value"]
        # Look it up: infl_index holds the base forms and infl_list the inflections,
        # so find the position in infl_index first, then read infl_list at that position.
        if word in infl_index:
            pos = infl_index.index(word)
        else:
            continue
        if len(infl_list) <= 1:
            continue
        idx_infl_tag = soup.new_tag("idx:infl")
        for x in range(1, len(infl_list[pos])):
            idx_iform_tag = soup.new_tag("idx:iform", value=infl_list[pos][x])
            idx_infl_tag.append(idx_iform_tag)
        # Insert the group of tags right after the idx:orth tag
        entry.find("idx:orth").insert_after(idx_infl_tag)
    name_split = file_name.split(".")
    name_split.insert(-1, "_add_infl.")
    file_name_infl = "".join(name_split)
    Fobj = open(file_name_infl, "w", encoding="utf-8")
    Fobj.write(str(soup))
    Fobj.close()
def buildtable(matrix, section):
    """Return an XML <informaltable> built from a matrix of rows."""
    if section is True:
        soup = BeautifulSoup('''<section>
            <title>Insert Title Here</title>
            <informaltable>
                <tgroup>
                </tgroup>
            </informaltable>
        </section>''', "xml")
    else:
        soup = BeautifulSoup('''<informaltable>
            <tgroup>
            </tgroup>
        </informaltable>''', "xml")
    # tgroup
    # cols = 'cols="' + str(len(matrix[1])) + '"'
    soup.tgroup['cols'] = str(len(matrix[1]))
    # thead
    thead = soup.new_tag("thead")
    soup.tgroup.append(thead)
    header = matrix.pop(0)
    thead.append(createrow(header))
    # tbody
    tbody = soup.new_tag("tbody")
    soup.tgroup.append(tbody)
    for row in matrix:
        tbody.append(createrow(row))
    return soup.prettify()
def generate_job_item(job, current_job=None):
    """
    Generates an HTML snippet for a single job as a list item in the side
    navigation bar.

    :param job: Job row
    :param current_job: Current job, can be None
    :rtype: Generated HTML or None
    """
    job_soup = BeautifulSoup()
    li = job_soup.new_tag("li")
    job_link = job_soup.new_tag("a", href="/apps/marc_batch/jobs/{0}/".format(job.pk))
    job_link.string = job.name
    li.append(job_link)
    if current_job is not None:
        if job.pk == current_job.pk:
            li['class'] = 'active'
            job_tasks = job_soup.new_tag("ul")
            job_tasks["class"] = "nav nav-list"
            history_li = job_soup.new_tag("li")
            history_link = job_soup.new_tag("a", href="/apps/marc_batch/jobs/{0}/history/".format(current_job.pk))
            history_link.string = "History"
            history_li.append(history_link)
            job_tasks.append(history_li)
            li.append(job_tasks)
    print(li)
    return mark_safe(str(li))
def modify_html(self, html, source_article_id):
    # we need this in order to plot the heatmap
    soup = Soup(html, 'html.parser')
    head = soup.find('base')
    print soup.find("title")
    if head is not None:
        head.decompose()
    css = soup.find("link", {"rel": "stylesheet"})
    if css is not None:
        css['href'] = 'https:' + css['href']
        headers = {'user-agent': EMAIL}
        r = requests.get(css['href'], headers=headers, stream=True)
        css['href'] = ""
        if r.status_code == 200:
            style = soup.new_tag('style')
            style.string = r.text
            css.insert_after(style)
        else:
            print('FAIL: Cannot load css for id: "%s" ' % source_article_id)
            css.decompose()
    last_element_on_page_meta = soup.new_tag('meta')
    last_element_on_page_meta['http-equiv'] = "content-type"
    last_element_on_page_meta['content'] = "text/html; charset=utf-8"
    body = soup.find('body')
    # if body is not None:
    last_element_on_page = soup.new_tag('div')
    last_element_on_page['class'] = "pyqt_is_shit"
    body.append(last_element_on_page)
    return soup.prettify(encoding='utf-8')
def dir_links(self, path):
    soup = BeautifulSoup('<ul></ul>', 'lxml')
    sublist = soup.ul
    for root, subFolders, files in os.walk(path):
        link = 'file://' + root
        name = root.split('/')[-1]
        item_tag = soup.new_tag('li')
        a_tag = soup.new_tag('a', href=link)
        a_tag.string = name.split('.')[0]
        item_tag.append(a_tag)
        sublist.append(item_tag)
        sublist_tag = soup.new_tag('ul')
        item_tag.append(sublist_tag)
        for f in files:
            link = 'file://' + root + '/' + f
            item_tag = soup.new_tag('li')
            a_tag = soup.new_tag('a', href=link)
            a_tag.string = f
            item_tag.append(a_tag)
            sublist_tag.append(item_tag)
        # If next os.walk iteration is going down a level, go down a level in list
        if subFolders:
            sublist = sublist.ul
    return soup.ul
def add_sidebar(content, item):
    page = BeautifulSoup(content)
    # Table of contents.
    table = []
    for i in page.section.find_all('h2'):
        i['id'] = i.string.lower().replace(' ', '-')
        table.append(i)
    title = page.new_tag('h1')
    title.string = 'Table of contents'
    page.header.append(title)
    page.header.append(page.new_tag('ul'))
    for i in table:
        tag = page.new_tag('li')
        a = page.new_tag('a', href='#' + i['id'])
        a.string = i.string
        tag.append(a)
        page.header.find_all('ul')[-1].append(tag)
    return str(page)
def create_new_entry(title_contents, article_contents, link, img_link=''):
    soup = BeautifulSoup("<entry></entry>")
    entry_tag = soup.entry
    id_tag = soup.new_tag('id')
    title_tag = soup.new_tag('title')
    updated_tag = soup.new_tag('updated')
    content_tag = soup.new_tag('content')
    link_tag = soup.new_tag('link')
    entry_tag.append(id_tag)
    entry_tag.append(title_tag)
    entry_tag.append(updated_tag)
    entry_tag.append(content_tag)
    entry_tag.append(link_tag)
    id_tag.string = link
    link_tag['href'] = link
    title_tag.contents = title_contents
    title_tag['type'] = 'xhtml'
    content_tag['type'] = 'xhtml'
    img_tag = soup.new_tag('img')
    img_tag['src'] = img_link
    article_contents.insert(0, img_tag)
    content_tag.contents = article_contents
    updated_tag.string = datetime.datetime.now().isoformat('T')
    return entry_tag
def _get_networks_tag(self):
    bs = BeautifulSoup()
    networks_tag = bs.new_tag('networks')
    for key in self.networks:
        network_tag = bs.new_tag('network')
        network_tag['sourceType'] = self.networks[key][1]['sourceType']
        network_tag['source'] = self.networks[key][1]['source']
        network_tag['targetType'] = self.networks[key][1]['targetType']
        network_tag['target'] = self.networks[key][1]['target']
        network_tag['id'] = key
        network_tag['isDirected'] = dmlpu.unformat_prop(self.networks[key]['isDirected'])
        network_tag['allowSelfLoops'] = dmlpu.unformat_prop(self.networks[key]['allowSelfLoops'])
        network_tag['isBinary'] = dmlpu.unformat_prop(self.networks[key]['isBinary'])
        e_l = self.networks[key].edge_list()
        if self.networks[key]['isBinary']:
            for i in range(len(e_l)):
                network_tag.append(bs.new_tag('link', source=e_l[i][0], target=e_l[i][1]))
        else:
            for i in range(len(e_l)):
                network_tag.append(bs.new_tag('link', source=e_l[i][0], target=e_l[i][1],
                                              value=self.networks[key].es[i]['weight']))
        networks_tag.append(network_tag)
    return networks_tag
def toc_from_headers(html_string):
    """make a table of contents from headers"""
    soup = BeautifulSoup(html_string, "html.parser")
    headers = soup.find_all(name=re.compile("h[1-3]"), id=True)
    toc_s = ""
    for h in headers:
        if h.name == "h1":
            toc_level = "level-1"
        elif h.name == "h2":
            toc_level = "level-2"
        else:
            toc_level = "level-3"
        new_a = soup.new_tag("a", href="#" + h["id"])
        if h.string:
            new_a.string = h.string
        else:
            new_a.string = " ".join(h.strings)
        new_li = soup.new_tag("li")
        new_li["class"] = toc_level
        new_li.append(new_a)
        toc_s += str(new_li) + "\n"
    return str(toc_s)
def add_meta_tag(page_dir, index_page):
    google_content = config['WEB-TOOLS']['google']
    bing_content = config['WEB-TOOLS']['bing']
    if not google_content and not bing_content:
        return
    with open('/opt/snare/pages/' + page_dir + "/" + index_page) as main:
        main_page = main.read()
    soup = BeautifulSoup(main_page, 'html.parser')
    if (google_content and
            soup.find("meta", attrs={"name": "google-site-verification"}) is None):
        google_meta = soup.new_tag('meta')
        google_meta.attrs['name'] = 'google-site-verification'
        google_meta.attrs['content'] = google_content
        soup.head.append(google_meta)
    if (bing_content and
            soup.find("meta", attrs={"name": "msvalidate.01"}) is None):
        bing_meta = soup.new_tag('meta')
        bing_meta.attrs['name'] = 'msvalidate.01'
        bing_meta.attrs['content'] = bing_content
        soup.head.append(bing_meta)
    html = soup.prettify("utf-8")
    with open('/opt/snare/pages/' + page_dir + "/" + index_page, "wb") as file:
        file.write(html)
def album2html(raw):
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(raw)
    desc = soup.find(id='album-desc')
    images_orig = soup.find(id='album-images')
    coverid = images_orig.get('data-cover')
    images = soup.new_tag('div', id='album-images')
    for imgtag in images_orig.find_all(['img']):
        try:
            idx = imgtag.get('data-id')
            img = Image.objects.get(idx=idx)
            self.image_set.add(img)
            imgtag['src'] = img.thumb_url
            imgtag['alt'] = img.desc if img.desc else ''
            imgtag['data-src'] = img.img_url
            images.append(imgtag)
        except:
            # ignore illegal img
            pass
    divcover = soup.new_tag('div', id='album-cover')
    try:
        coverimg = images.find(lambda tag: tag.get('data-id') == coverid)
        import copy
        divcover.append(copy.deepcopy(coverimg))
    except:
        pass
    return '\n'.join(map(lambda div: div.prettify(), [divcover, desc, images]))
def content_process(content, mode):
    content = clone_bs4_elem(content)
    del content['class']
    soup = BeautifulSoup(
        '<html><head></head><body></body></html>')
    soup.body.append(content)
    no_script_list = soup.find_all("noscript")
    for no_script in no_script_list:
        no_script.extract()
    if mode == 'answer':
        img_list = soup.find_all("img", class_=["origin_image", "content_image"])
    elif mode == 'post':
        img_list = soup.find_all("img")
    for img in img_list:
        if mode == 'answer':
            if "content_image" in img['class']:
                img['data-original'] = img['data-actualsrc']
            new_img = soup.new_tag('img', src=PROTOCOL + img['data-original'])
        elif mode == 'post':
            # the original-size image needs no suffix replacement
            new_img = soup.new_tag('img', src=PIC_PROTOCOL + img['src'].replace('.jpg', '_b.jpg'))
        img.replace_with(new_img)
        if img.next_sibling is None:
            new_img.insert_after(soup.new_tag('br'))
    useless_list = soup.find_all("i", class_="icon-external")
    for useless in useless_list:
        useless.extract()
    return soup.prettify()
def parse(raw_html, chapter_no):
    soup = BeautifulSoup(raw_html)
    charset_tag = soup.head.meta.extract()
    title_tag = soup.head.title.extract()
    style_tag = soup.new_tag("link", rel="stylesheet", type="text/css", href="../styles.css")
    soup.head.clear()
    soup.head.append(charset_tag)
    soup.head.append(title_tag)
    soup.head.append(style_tag)
    story_text = soup.find("div", id="storytext").extract()
    story_text.attrs = None
    chapters = soup.find("select", id="chap_select")
    chapter = chapters.find("option", value=str(chapter_no)).string
    soup.body.clear()
    soup.body.attrs = None
    chapter_tag = soup.new_tag("h2")
    chapter_tag.string = chapter
    soup.body.append(chapter_tag)
    soup.body.append(story_text)
    html = str(soup)
    html = html.replace("<!DOCTYPE html>", XHTML_TRANSITIONAL, 1)
    return html, chapter_no
def parse_body(self, response):
    """
    Parse the article body.

    :param response: the response object returned by the crawler
    :return: the processed HTML text
    """
    try:
        soup = BeautifulSoup(response.content, 'html.parser')
        body = soup.find_all(class_="x-wiki-content")[0]
        # Add the title and center it
        title = soup.find('h4').get_text()
        center_tag = soup.new_tag("center")
        title_tag = soup.new_tag('h1')
        title_tag.string = title
        center_tag.insert(1, title_tag)
        body.insert(1, center_tag)
        html = str(body)
        # Rewrite relative img src paths in the body to absolute URLs
        pattern = "(<img .*?src=\")(.*?)(\")"

        def func(m):
            if not m.group(2).startswith("http"):
                rtn = "".join([m.group(1), self.domain, m.group(2), m.group(3)])
                return rtn
            else:
                return "".join([m.group(1), m.group(2), m.group(3)])

        html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        return html
    except Exception as e:
        logging.error("parse error", exc_info=True)
def buildHtml(year):
    dataFilePath = 'e:\\patent\\patent-cn-{0}.txt'.format(str(year))
    if os.path.exists(dataFilePath):
        patents = []
        with open(dataFilePath, 'r') as f:
            for each_line in f:
                patent = eval(each_line)
                if patent['type'] == 'patent':
                    rate = pattern.findall(patent['rate'])
                    if int(rate[1]) >= 9:
                        patent['count'] = int(rate[0])
                        patents.append(patent)
        patents = sorted(patents, key=itemgetter('count'), reverse=True)
        soup = BeautifulSoup('<html><head></head><body><ul></ul></body></html>')
        ul = soup.ul
        for patent in patents[0:20]:
            li = soup.new_tag('li')
            a = soup.new_tag('a', href=patent['url'], target='_blank')
            a.string = patent['title']
            li.append(a)
            li.append(patent['rate'])
            ul.append(li)
        htmlFilePath = 'e:\\patent\\patent-cn-{0}.html'.format(str(year))
        with open(htmlFilePath, 'w') as f:
            f.write(soup.prettify())
def highlight_syntax(self, soup):
    """
    Highlight code syntax.

    :param soup: bs4 instance
    :return: bs4 instance
    """
    code_tags = soup.find_all('code')
    for code in code_tags:
        if code.has_attr('class'):
            lang = code['class']
            code.parent['class'] = "highlight " + lang[0]
            del code['class']
            code.name = "span"
            in_pre_code = syntax_highlight(lang[0], code.string)
            if self.config.CLIPBOARD:
                s = BeautifulSoup("<blockquote></blockquote>")
                blockquote = s.blockquote
                blockquote['class'] = 'highlight ' + lang[0]
                p = s.new_tag('p')
                a = s.new_tag('a', href="#")
                a['class'] = 'clipboard'
                a["data-clipboard-text"] = code.string
                a["data-clipboard-action"] = "copy"
                a.append("copy")
                p.append(a)
                blockquote.append(p)
                in_pre_code += str(blockquote)
            code.parent.replaceWith(in_pre_code)
    return soup
def extract_table(content, rows=None):
    dammit = UnicodeDammit(content, ["utf-8", "latin-1", "iso-8859-1"])
    soup = BeautifulSoup(dammit.unicode_markup)
    table = soup.find("table")
    # removing coordinates
    table.tr.find_next_sibling("tr").extract()
    table.tr.find_next_sibling("tr").extract()
    # moving the link to a new column
    trs = table.tr.find_next_siblings("tr")
    more = trs[2].td.find_next_sibling("td")
    link = trs[3].td.find_next_sibling("td")
    trs[2].td.extract()
    trs[3].td.extract()
    trs[2].decompose()
    trs[3].decompose()
    trs[0].append(more)
    trs[1].append(link)
    # copyright info, if any
    if rows:
        new_td = soup.new_tag("td", colspan=2)
        new_td.string = " — ".join(rows)
        new_tr = soup.new_tag("tr")
        new_tr.append(new_td)
        trs[1].insert_after(new_tr)
    s = unicode(table)
    return u''.join('&%s;' % entities[ord(c)] if ord(c) in entities else c for c in s)
def build_bonita_role_xml(uuid, name, description='', label='', dbid='', with_class=False):
    """ Build XML for a Bonita Role information """
    # Build XML body
    soup = BeautifulSoup('', 'xml')
    tag_role = soup.new_tag('Role')
    if with_class:
        tag_role.attrs['class'] = 'Role'
    tag_uuid = soup.new_tag('uuid')
    tag_name = soup.new_tag('name')
    tag_description = soup.new_tag('description')
    tag_label = soup.new_tag('label')
    tag_dbid = soup.new_tag('dbid')
    tag_uuid.string = uuid
    tag_name.string = name
    tag_description.string = description
    tag_label.string = label
    tag_dbid.string = dbid
    role_tags = [tag_uuid, tag_name, tag_description, tag_label, tag_dbid]
    for tag in role_tags:
        tag_role.append(tag)
    return unicode(tag_role)
def index(request):
    # return HttpResponse('Hello from Python!')
    # return render(request, 'index.html' )
    resultsParser = ResultsParser()
    resultsModel = resultsParser.parse('http://cfrsolo2.com/2016/04-17-16-brooksville_fin.htm')
    # return render(request, 'adrian0.html')
    # r = requests.get('http://httpbin.org/status/418')
    # print r.text
    # return HttpResponse('<pre>' + r.text + '</pre>')
    soup = BeautifulSoup()
    new_img_tag = soup.new_tag("img", style='position: absolute; top: 0; right: 0; border: 0;',
                               src="https://camo.githubusercontent.com/e7bbb0521b397edbd5fe43e7f760759336b5e05f/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f677265656e5f3030373230302e706e67")
    new_a_tag = soup.new_tag("a", href='https://github.com/orozcoadrian/race-graphs')
    new_a_tag.append(new_img_tag)
    soup.append(new_a_tag)
    years = get_years_from_homepage()
    for year in years:
        new_a_tag = soup.new_tag("a", href=year)
        new_a_tag.string = year
        soup.append(new_a_tag)
        new_a_tag.append(soup.new_tag('br'))
    # self.wfile.write(soup.prettify())
    return HttpResponse(soup.prettify())
def __query_params_file(self, rtag, q):
    # queries are in a dict q
    # build the query-param XML and write it out to disk
    soup = BeautifulSoup("<parameters></parameters>", "xml")
    # float n query tags in the soup
    for num in q:
        T_query = soup.new_tag("query")
        T_type = soup.new_tag("type")
        T_type.string = "indri"
        T_number = soup.new_tag("number")
        T_number.string = num
        T_text = soup.new_tag("text")
        T_text.string = "#combine(" + q[num] + ")"
        T_query.append(T_type)
        T_query.append(T_number)
        T_query.append(T_text)
        soup.parameters.append(T_query)
    o_file = os.path.join(self.path["RUNS"], rtag + ".indri")
    # purge the XML declaration introduced by BeautifulSoup and
    # shape it up for Indri to consume
    with open(o_file, "w") as f:
        f.write(self.__shapeup_xml(soup.prettify().split("\n")[1:]))
    return o_file
def compareandsave(sampleparagraphs, textparagraphs, q, filepath):
    res = compareParagraps(sampleparagraphs, textparagraphs, q)
    filesp = os.path.splitext(filepath)
    tex = BeautifulSoup(features='xml')
    tex.append(tex.new_tag('name'))
    tex.append(tex.new_tag('body'))
    for par in res[0]:
        tex.body.append(par)
    html = tex.prettify('utf-8')
    with open(filesp[0] + '_res' + filesp[1], 'wb') as file:
        file.write(html)
    log = BeautifulSoup(features='xml')
    log.append(log.new_tag('missing'))
    log.append(log.new_tag('errors'))
    for par in res[1]:
        log.errors.append(par)
    for par in res[2]:
        log.missing.append(par)
    l = log.prettify('utf-8')
    # file = open(filesp[0] + '_log' + filesp[1], 'w', encoding='utf-8')
    with open(filesp[0] + '_log' + filesp[1], 'wb') as file:
        file.write(l)
def xmlify(filename):
    """
    create an xml representation of the text files
    :param filename: str name of file
    """
    with codecs.open(filename, 'r', 'utf-8') as infile:
        raw_rambam = infile.read()
    chap_index = [getGematria(i.group(1)) for i in
                  re.finditer(ur'@00\u05e4\u05e8\u05e7 ([\u05d0-\u05ea]{1,2})', raw_rambam)]
    chapters = re.split(ur'@00\u05e4\u05e8\u05e7 [\u05d0-\u05ea]{1,2}', raw_rambam)[1:]
    assert len(chap_index) == len(chapters)
    soup = BeautifulSoup(u'<root></root>', 'xml')
    for index, chapter in zip(chap_index, chapters):
        x_chapter = soup.new_tag('chapter', num=unicode(index))
        soup.root.append(x_chapter)
        v_indices = [getGematria(i.group(1)) for i in
                     re.finditer(ur'@22([\u05d0-\u05ea]{1,2})', chapter)]
        verses = re.split(ur'@22[\u05d0-\u05ea]{1,2}', chapter)[1:]
        assert len(v_indices) == len(verses)
        for v_index, verse in zip(v_indices, verses):
            x_verse = soup.new_tag('verse', num=unicode(v_index))
            comments = verse.splitlines()
            for i, comment in enumerate(comments[1:]):
                x_comment = soup.new_tag('comment', num=unicode(i + 1))
                x_comment.append(comment)
                x_verse.append(x_comment)
            x_chapter.append(x_verse)
    with codecs.open('./xml/{}'.format(filename.replace('.txt', '.xml')), 'w', 'utf-8') as outfile:
        outfile.write(unicode(soup.prettify()))
def update_file_meta(abs_path_to_file): """ This function accepts the absolute path to a file that has already been verified as containing learning activity metadata. It fills several roles: - Parse the XML file to determine the activity's <title> value. - Fetch new metadata for this activity from the course_metadata dict. - Update metadata in the XML file for 3 key tags: <available_at>, <due_at>, and <lock_at>. """ global course_metadata, undefined_activities # Open the XML file and instantiate Beautiful Soup parsing. xml_file = open(abs_path_to_file, mode="rt+", encoding="utf-8") # Snag the first line, which contains the good-practice XML declaration. # Beautiful Soup erases it. xml_declaration = xml_file.readline() raw_xml = xml_file.read() soup = BeautifulSoup(raw_xml, "xml") # Get the learning activity's title from the previous semester. Use it to # look up the new metadata, which is stored as a subdict in the dictionary. # If no such entry is found, add it to a list (undefined_acts) to be # returned and ultimately printed to the user. prev_title = soup.title.string try: new_metadata = course_metadata[prev_title] except KeyError: undefined_activities.append(prev_title) return # If an modified title is specified, update it. Otherwise keep the previous # title. if new_metadata["new_title"]: soup.title.string = new_metadata["new_title"] else: pass # If new available, due, or lock times are specified, update them. # If not, delete the times that were copied over from the previous # semester. if new_metadata["new_avail_datetime"]: unlock_at_str = format_datetime(new_metadata["new_avail_datetime"]) else: unlock_at_str = "" if new_metadata["new_due_datetime"]: due_at_str = format_datetime(new_metadata["new_due_datetime"]) else: due_at_str = "" if new_metadata["new_lock_datetime"]: lock_at_str = format_datetime(new_metadata["new_lock_datetime"]) else: lock_at_str = "" tags_to_update = { "unlock_at": unlock_at_str, "due_at": due_at_str, "lock_at": lock_at_str, "all_day_date": "" } for tag in tags_to_update.keys(): try: exec("soup.{}.string = '{}'".format(tag, tags_to_update[tag])) except AttributeError: # If tag is not present in the soup, add it, then provide the # up-to-date string metadata. tag_to_add = soup.new_tag(tag) soup.contents[0].insert(3, tag_to_add) exec("soup.{}.string = '{}'".format(tag, tags_to_update[tag])) for tag in tags_to_update.keys(): try: exec("soup.assignment.{}.string = '{}'".format( tag, tags_to_update[tag])) except AttributeError: pass # Write the updated XML back to the file. Note that, weirdly, the soup # object is a list that always contains exactly 1 entry -- that is, # a "tag" object containing updated XML code. Need to convert it to string. xml_file.truncate(0) xml_file.seek(0) # Give the XML declaration back. xml_file.write(xml_declaration) xml_file.write(str(soup.contents[0])) xml_file.close() return
def addToParentIndex(des, tipe, Xrc): """ adds Xpage or Xbook on parent's index """ title = des.split("/")[-1].replace(".html", "") index = des.replace(os.path.basename(des), "index.html") with open(index, 'r') as f: soup = BeautifulSoup(f, "html.parser") f.close() with open(index, 'w') as f: notebook = "/".join(des.split("/")[2:]) soup.head.title.string = 'TOC of ' + des.split("/")[-1].split(".")[0] if tipe == "Xpage": tr = soup.new_tag('tr') tr["id"] = title tr["onclick"] = "window.location.replace('$LINK$'); updateExplorer_IFrame('$LINK$')".replace( "$LINK$", '\\\\' + Xrc["gh_repo_name"] + '/' + notebook) tr["style"] = "background-color: rgb(55, 57, 58); width: 100vw; box-shadow: gray 2px 2px 2px;" th = soup.new_tag('th') th["scope"] = "row" th["style"] = "border: none; width: 60vw;" th.string = title td = soup.new_tag('td') td["style"] = "border: none; width: 40vw;" td.string = datetime.datetime.fromtimestamp(time.time()).strftime( "%H:%M.%S|$MONTH$ %d %Y by Xbooks[bot]").replace( "$MONTH$", chooseMonth( datetime.datetime.fromtimestamp( time.time()).strftime("%m"))) tr.insert(0, td) tr.insert(0, th) soup.body.select('table')[1].tbody.insert(0, tr) if tipe == "Xbook": notebook = notebook + "/index.html" shutil.copy2( des.replace("docs/", "") + "/card.png", des + "/card.png") ccc.note("copied " + des.replace("docs/", "") + "/card.png to" + des + "/card.png") td = soup.new_tag('td') td["id"] = title div_wrapper = soup.new_tag('div') div_wrapper[ "onclick"] = "window.location.replace('$LINK$'); updateExplorer_IFrame(\'$LINK$\');".replace( "$LINK$", '\\\\' + Xrc["gh_repo_name"] + '/' + notebook) div_wrapper["class"] = "card bg-light mb-3" div_wrapper[ "style"] = "max-width: 20rem; background-color: rgba(39, 39, 39, 0.819) !important; color: rgb(200, 192, 188) !important; border: none; box-shadow: gray 5px 5px 5px;" div_head = soup.new_tag('div') div_head["class"] = "card-header" div_head[ "style"] = "background-color: rgba(37, 37, 37, 0.877) !important; border: none; color: white;" div_head.string = title div_wrapper.insert(0, div_head) if os.path.exists(des.replace("docs/", "") + "/card.png"): div_body = soup.new_tag('div') div_body["class"] = "card-body" img = soup.new_tag('img') img["style"] = "height: 256px; width: 256px; display: block; filter: saturate(0.7) brightness(0.5);" img["src"] = title + "/card.png" div_body.insert(0, img) div_wrapper.insert(-1, div_body) td.insert(0, div_wrapper) if len(soup.table.tr.select("td")) == 3: tr = soup.new_tag('tr') soup.table.insert(0, tr) soup.table.tr.insert(0, td) f.write(soup.prettify(formatter="html")) f.close() ccc.success("adding " + des + " to parent index")
        if re.search('^\/', form.get('action')):
            form['action'] = stype + '://' + hostget + form.get('action')
        else:
            form['action'] = stype + '://' + hostget + '/' + form.get('action')
    else:
        form['action'] = stype + '://' + hostget + '/' + form.get(
            'action').split('/', 1)[1].split('/', 1)[1].split('/', 1)[1]

    # If the autofill feature was enabled, then open the autofill file
    # and inject each hidden autofill input type into the form
    if args['autofill'] is not None:
        autofile = open(args['autofill'], 'r')
        for line in autofile:
            if re.match('^#', line) is None:
                autoform = soup.new_tag(line)
                form.insert(0, autoform)
        autofile.close()

    phishHtml = soup.prettify(formatter="html")
    # BeautifulSoup fixes broken HTML, I do not want this to happen for &amp;
    phishHtml = re.sub('&amp;', '&', phishHtml)

    # Rewrite CSS url(), first look for all matches
    urlCSS = re.findall(r'url\((.*)\)', phishHtml)
    # Loop through matches and replace
    for urls in urlCSS:
        if re.search('\)', urls):
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder): def __init__(self, namespaceHTMLElements, soup=None, store_line_numbers=True, **kwargs): if soup: self.soup = soup else: from bs4 import BeautifulSoup # TODO: Why is the parser 'html.parser' here? To avoid an # infinite loop? self.soup = BeautifulSoup("", "html.parser", store_line_numbers=store_line_numbers, **kwargs) # TODO: What are **kwargs exactly? Should they be passed in # here in addition to/instead of being passed to the BeautifulSoup # constructor? super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements) # This will be set later to an html5lib.html5parser.HTMLParser # object, which we can use to track the current line number. self.parser = None self.store_line_numbers = store_line_numbers def documentClass(self): self.soup.reset() return Element(self.soup, self.soup, None) def insertDoctype(self, token): name = token["name"] publicId = token["publicId"] systemId = token["systemId"] doctype = Doctype.for_name_and_ids(name, publicId, systemId) self.soup.object_was_parsed(doctype) def elementClass(self, name, namespace): kwargs = {} if self.parser and self.store_line_numbers: # This represents the point immediately after the end of the # tag. We don't know when the tag started, but we do know # where it ended -- the character just before this one. sourceline, sourcepos = self.parser.tokenizer.stream.position() kwargs['sourceline'] = sourceline kwargs['sourcepos'] = sourcepos - 1 tag = self.soup.new_tag(name, namespace, **kwargs) return Element(tag, self.soup, namespace) def commentClass(self, data): return TextNode(Comment(data), self.soup) def fragmentClass(self): from bs4 import BeautifulSoup # TODO: Why is the parser 'html.parser' here? To avoid an # infinite loop? self.soup = BeautifulSoup("", "html.parser") self.soup.name = "[document_fragment]" return Element(self.soup, self.soup, None) def appendChild(self, node): # XXX This code is not covered by the BS4 tests. 
self.soup.append(node.element) def getDocument(self): return self.soup def getFragment(self): return treebuilder_base.TreeBuilder.getFragment(self).element def testSerializer(self, element): from bs4 import BeautifulSoup rv = [] doctype_re = re.compile( r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$') def serializeElement(element, indent=0): if isinstance(element, BeautifulSoup): pass if isinstance(element, Doctype): m = doctype_re.match(element) if m: name = m.group(1) if m.lastindex > 1: publicId = m.group(2) or "" systemId = m.group(3) or m.group(4) or "" rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % (' ' * indent, name, publicId, systemId)) else: rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name)) else: rv.append("|%s<!DOCTYPE >" % (' ' * indent, )) elif isinstance(element, Comment): rv.append("|%s<!-- %s -->" % (' ' * indent, element)) elif isinstance(element, NavigableString): rv.append("|%s\"%s\"" % (' ' * indent, element)) else: if element.namespace: name = "%s %s" % (prefixes[element.namespace], element.name) else: name = element.name rv.append("|%s<%s>" % (' ' * indent, name)) if element.attrs: attributes = [] for name, value in element.attrs.items(): if isinstance(name, NamespacedAttribute): name = "%s %s" % (prefixes[name.namespace], name.name) if isinstance(value, list): value = " ".join(value) attributes.append((name, value)) for name, value in sorted(attributes): rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) indent += 2 for child in element.children: serializeElement(child, indent) serializeElement(element, 0) return "\n".join(rv)
def main() -> int: """ Entry point for the executable. """ parser = argparse.ArgumentParser( description= "Build the Standard Ebooks Manual of Style from a set of .rst files.") parser.add_argument( "source_directory", metavar="SOURCE_DIRECTORY", help= "a directory containing .rst files comprising the Standard Ebooks Manual of Style" ) parser.add_argument("dest_directory", metavar="DEST_DIRECTORY", help="a directory to place the output .php files") args = parser.parse_args() return_code = 0 if not os.path.isdir(args.source_directory): print(f"Not a directory: `{args.source_directory}`") return 1 if not os.path.isdir(args.dest_directory): print(f"Not a directory: `{args.dest_directory}`") return 1 toc = [] header_path = Path(args.source_directory) / "templates" / "header.html" footer_path = Path(args.source_directory) / "templates" / "footer.html" try: with open(header_path, "r", encoding="utf-8") as file: header_html = file.read() except: print(f"Couldn’t open `{header_path}`") return 1 try: with open(footer_path, "r", encoding="utf-8") as file: footer_html = file.read() except: print(f"Couldn’t open `{footer_path}`") return 1 with tempfile.TemporaryDirectory() as work_directory: for filename in os.listdir(args.source_directory): if not filename.endswith(".rst"): continue with open(Path(args.source_directory) / filename, "r", encoding="utf-8") as file: rst = file.read() # Add our special RST roles to the top of the file before processing rst = RST_ROLES + rst result = subprocess.run(["rst2html5"], input=rst.encode(), stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False) errors = result.stderr.decode().strip() if errors: print(filename) # Because we add the RST roles to the top of the file, we have to subtract those lines to get the # *real* line number in the RST file that the error occurs in. errors = regex.sub( "<stdin>:([0-9]+)", lambda exp: "\tLine {}".format( int(exp.groups()[0]) - RST_ROLES_LINE_COUNT), errors).rstrip() print(errors) return_code = 1 html = result.stdout.decode().strip() matches = regex.findall(r"<h1>(.+?)</h1>", html) if matches: title = matches[0] # Remove empty spans html = regex.sub(r"<span>[^>]*?</span>", "", html, flags=regex.DOTALL) # SE extension: :italics:`abc <def>` will generate a link like so: <i><a href="def">abc</a></i> html = regex.sub(r"<em class=\"i\">([^>]+?) <([^<]+?)></em>", r"""<i><a href="\2">\1</a></i>""", html) # SE extension: change <em class="i"> to <i> html = regex.sub(r"<em class=\"i\">([^<]+?)</em>", r"<i>\1</i>", html) # Change :ws: and :utf: markers to <span>s html = regex.sub(r":(ws|utf):`([^`]+?)`", r"""<span class="\1">\2</span>""", html) # Remove comments html = regex.sub(r"<!--.+?-->", "", html) # Pygments doesn't add colors to html that is just a namespaced attribute, like :html:`xml:lang`. Add that here. html = regex.sub( r"""<code class="html">([a-zA-Z\-:]+?)</code>""", r"""<code class="html"><span class="na">\1</span></code>""", html) root_number = None matches = regex.findall(r"^([0-9]+)\-", filename) if matches: root_number = matches[0] # Now we have some cleaned up HTML. # Start parsing the various <section> and <ol> elements to number them. 
soup = BeautifulSoup(html, "html.parser") if root_number: # Set the ID on the top-level manual section top_level_section = soup.select("body > section")[0] top_level_section["id"] = root_number # Do the actual numbering process_ids(top_level_section, root_number, 1) # Record the number and its h2 children in the ToC toc_item = TocItem(root_number, title, filename.replace(".rst", "")) for header in soup.select("h2"): toc_item.items.append( TocItem(header.parent["id"], header.text, None)) toc.append(toc_item) # rst2html5 doesn't wrap the first child of <li> elements in <p>. # Try to do that here. for li_item in soup.select("li"): need_wrapping = [] for elem in li_item.contents: if isinstance(elem, NavigableString ) or elem.name not in BLOCK_LEVEL_ELEMENTS: need_wrapping.append(elem) if elem.name in BLOCK_LEVEL_ELEMENTS: break if need_wrapping: new_tag = soup.new_tag("p") for elem in need_wrapping: new_tag.append(elem) li_item.insert(0, new_tag) # Now that we've got our structure done, insert <aside>s that have the section numbers in them. for elem in soup.find_all( "", attrs={"id": regex.compile(r"^[0-9\.]+$")}): aside = soup.new_tag("aside") aside["class"] = "number" # Add a link to the section within the section <aside>, but only if it is not the main section number (like "2" or "8") if regex.match(r"^[0-9]$", elem["id"]): aside.string = elem["id"] else: link = soup.new_tag("a") link["href"] = f"#{elem['id']}" link.string = elem["id"] aside.insert(0, link) elem.insert(0, aside) html = str(soup) # Now that we've added IDs and <aside>s, remove the now-unnecessary "no-numbering" class html = html.replace(" class=\"no-numbering\"", "") # Add a <b> around the first word in a bash command, to highlight it. html = regex.sub(r"<code class=\"bash\">([a-z]+) ", r"""<code class="bash"><b>\1</b> """, html) # Add syntax highlighting around value strings html = regex.sub( r"<code class=\"value\">([^<]+?)</code>", r"""<code class="bash"><span class="s">\1</span></code>""", html) # Remove everything up to and including the body element so that we can add our own headers and footers html = regex.sub(r".+?<body>", "", html, flags=regex.DOTALL) html = regex.sub(r"</body>.*", "", html, flags=regex.DOTALL) # If we use CSS properties like -epub-hyphens, the colorizer considers them errors and adds error coloring. Remove that here. html = regex.sub( r"""<span class="err">-</span><span class="n">(.+?)</span>""", r"""<span class="k">-\1</span>""", html) # Convert spaces to tabs html = regex.sub(r" ", "\t", html) # Add PHP headers and footers html = header_html + html + footer_html # Replace <pre> with <figure>. # Do this last, because editing with BS4 and pretty printing can muck up # spacing in <pre> elements if the elements are removed early html = regex.sub(r"<pre data-language=\"([^\"]+?)\">", r"""<figure><code class="\1 full">""", html) html = regex.sub( r"<pre class=\"([^\"]+?)\" data-language=\"([^\"]+?)\">", r"""<figure class="\1"><code class="\2 full">""", html) html = regex.sub( r"<pre data-language=\"([^\"]+?)\" class=\"([^\"]+?)\">", r"""<figure class="\2"><code class="\1 full">""", html) html = regex.sub(r"</pre>", r"</code></figure>", html) # Fill in <title> elements if filename == "index.rst": version = regex.findall(r"\.\. version: (.+)", rst)[0] html = regex.sub(r"MANUAL_TITLE", "The Standard Ebooks Manual", html) html = regex.sub(r"<section id=\".+?\"", r"<section", html) else: html = regex.sub( r"MANUAL_TITLE", f"{root_number}. 
{title} - The Standard Ebooks Manual", html) with open(Path(work_directory) / filename.replace(".rst", ".php"), "w", encoding="utf-8") as file: file.write(html) file.truncate() # Now, generate the ToC toc = natsorted(toc, key=lambda x: x.number) toc_html = f"<nav><p><a href=\"/manual/{version}\">The Standard Ebooks Manual of Style</a></p><ol>" for toc_item in toc: toc_html += f"<li><p><a href=\"/manual/{version}/{toc_item.filename}\">{toc_item.number}. {escape(toc_item.title)}</a></p><ol>" for sub_item in toc_item.items: toc_html += f"<li><p><a href=\"/manual/{version}/{toc_item.filename}#{sub_item.number}\">{sub_item.number} {escape(sub_item.title)}</a></p></li>" toc_html += "</ol></li>" toc_html += "</ol></nav>" # Place the ToC and version number into the final files for filename in os.listdir(work_directory): if not filename.endswith(".php"): continue with open(Path(work_directory) / filename, "r", encoding="utf-8") as file: html = file.read() html = html.replace("VERSION", version) if filename != "index.php": html = regex.sub(r"<main(.+?)>", fr"<main\1>{toc_html}", html) # Check if pygments generated any errors (for example, missing quotes in an HTML attribute) if "class=\"err\"" in html: print( f"Error colorized code in `{filename}`. Search the file for `class=\"err\"`." ) with open(Path(args.dest_directory) / filename, "w", encoding="utf-8") as file: file.write(html) file.truncate() return return_code
def genSelectBox(df, session_state):
    """
    This function generates select boxes for choosing the school network

    Parameters:
        df (type): 2019 school census dataframe
        session_state (type): section dataset
    """
    st.write(
        f"""
        <div class="main-padding" id="top">
            <div class="subtitle-section">
                Selecione sua rede
            </div>
        </div>
        """,
        unsafe_allow_html=True,
    )
    col1, col2, col3, col4 = st.beta_columns([0.3, 0.5, 0.5, 1])
    with col1:
        session_state.state_id = st.selectbox("Estado", utils.filter_place(df, "state"))
        session_state.state_name = utils.set_state_name(df, session_state.state_id)
    with col2:
        options_city_name = utils.filter_place(df, "city", state_id=session_state.state_id)
        options_city_name = pd.DataFrame(data=options_city_name, columns=["city_name"])
        x = int(options_city_name[options_city_name["city_name"] == "Todos"].index.tolist()[0])
        session_state.city_name = st.selectbox("Município", options_city_name, index=x)

        import pathlib
        from bs4 import BeautifulSoup

        GA_JS = ("""
        window.dataLayer = window.dataLayer || [];
        function municipio(){dataLayer.push({'municipio_value': '%s'});}
        """ % session_state.city_name)
        index_path = pathlib.Path(st.__file__).parent / "static" / "index.html"
        soup = BeautifulSoup(index_path.read_text())
        script_tag_loader = soup.new_tag("script")
        script_tag_loader.string = GA_JS
    with col3:
        options_adiminlevel = utils.filter_place(
            df,
            "administrative_level",
            state_id=session_state.state_id,
            city_name=session_state.city_name,
        )
        options_adiminlevel = pd.DataFrame(data=options_adiminlevel, columns=["adiminlevel"])
        y = int(options_adiminlevel[options_adiminlevel["adiminlevel"] == "Todos"].index.tolist()[0])
        session_state.administrative_level = st.selectbox(
            "Nível de Administração", options_adiminlevel, index=y)
    with col4:
        st.write(
            f"""
            <div class="container main-padding">
                <br><br>
            </div>
            """,
            unsafe_allow_html=True,
        )
def cleanup(html_file): node = int(os.path.basename(html_file).rstrip(".html.zst")) if node in skipNodes: print(f"Skipping ignored node {node}") return if (html_file.endswith('.zst')): with open(html_file, 'rb') as doc: soup = BeautifulSoup(zstd.decompress(doc.read()), 'lxml') else: with open(html_file, 'r') as doc: soup = BeautifulSoup(doc.read(), 'lxml') # Set title to entry name soup.title.string = soup.title.string.split('|')[1].strip() # Remove soft-hypens soup.title.string = soup.title.string.replace(u'\xad', '') # revision is correct for all nodes, but canonical # only for most parts. docpath = soup.find('link', rel='revision') if not docpath: docpath = soup.find('link', rel='canonical') if not docpath: print(f'Failed to parse document name: {html_file}') return url = urllib.parse.unquote(docpath['href']) with open('seen-urls.txt', 'a') as f: f.write(f'{url} {os.path.basename(html_file)}\n') subpath = urllib.parse.urlparse(url).path ignore = True for path in allowed_paths.keys(): if path in subpath: ignore = False break if not ignore: out = f'{subpath[1:]}.html' else: with open('ignored-links.txt', 'a') as f: f.write(f'{url} - {os.path.basename(html_file)}\n') return if os.path.exists(out): print(f'{out} already exists, renaming to {subpath[1:]}_2.html') out = f'{subpath[1:]}_2.html' # ads [ div.decompose() for div in soup.find_all('aside', class_=lambda x: x != 'rule__note') ] # Cookie notice crap [div.decompose() for div in soup.find_all('div', class_='cookie-notice')] # IE stuff [ comment.extract() for comment in soup.findAll( text=lambda text: isinstance(text, Comment)) ] # header [div.decompose() for div in soup.find_all('div', 'tabloid__masthead')] # search-box [div.decompose() for div in soup.find_all('div', 'form-asap')] # Gizmo? [div.decompose() for div in soup.find_all('nav', 'gizmo')] # Footer [ div.decompose() for div in soup.find_all('div', 'tabloid__footer-top-line') ] [div.decompose() for div in soup.find_all('div', 'tabloid__footer-top')] [div.decompose() for div in soup.find_all('div', 'tabloid__footer-bottom')] # all scripts [div.decompose() for div in soup.find_all('script')] [div.decompose() for div in soup.find_all('form')] [div.decompose() for div in soup.find_all('meta')] [div.decompose() for div in soup.find_all('style')] # Any css and co. 
[div.decompose() for div in soup.find_all('noscript')] [div.decompose() for div in soup.find_all('link')] # Duden mentor [div.decompose() for div in soup.find_all('div', {"id": "block-premium"})] # Add UTF-8 charset new_tag = soup.new_tag('meta', charset='utf-8') soup.head.append(new_tag) # Fix view on mobile new_tag = soup.new_tag('meta', attrs={ 'name': 'viewport', 'content': 'width=device-width, initial-scale=1.0' }) soup.head.append(new_tag) # Disable any referrers new_tag = soup.new_tag('meta', attrs={ 'name': 'referrer', 'content': 'no-referrer' }) soup.head.append(new_tag) # Fixup breadcrumb for link in soup.find_all('a', class_='breadcrumb__crumb'): if link['href'] == '/': link.nextSibling.decompose() link.decompose() break # Add bundle.min.css new_tag = soup.new_tag('link', href=f'/css/bundle.min.css', media='all', rel='stylesheet') soup.head.append(new_tag) for img in soup.find_all('img'): dest = urllib.parse.urlparse(img['src']) with open('media.txt', 'a') as f: f.write(f"{dest.scheme}://{dest.netloc}{dest.path}\n") img['src'] = f'{dest.path}' contains_audio = False contains_notation = False for link in soup.find_all('a'): if not 'href' in link.attrs: # <a id="real, Realität" name="real, Realität">real, Realität</a> continue dest = urllib.parse.unquote(link['href']) dest = urllib.parse.urlparse(dest) is_media = False if 'data-duden-ref-type' in link.attrs: if link['data-duden-ref-type'] in ['audio', 'image']: with open('media.txt', 'a') as f: f.write(f"{dest.scheme}://{dest.netloc}{dest.path}\n") link['href'] = f'{dest.path}' is_media = True if link['data-duden-ref-type'] == 'audio': contains_audio = True if (not is_media) and (not link['href'].startswith('#')): if dest.netloc and ('duden.de' not in dest.netloc): with open('external-links.txt', 'a') as f: f.write(f"{link['href']}\n") continue # This can still be audio eg. grammatik-randummern-* if ('class' in link.attrs) and ('notation__audio' in link['class']): with open('media.txt', 'a') as f: f.write(f"{dest.scheme}://{dest.netloc}{dest.path}\n") link['href'] = f'{dest.path}' contains_notation = True elif dest.fragment: link['href'] = f'{dest.path}.html#{dest.fragment}' else: link['href'] = f'{dest.path}.html' if contains_audio: new_tag = soup.new_tag('script', src='/js/pronunciation-guide.js') soup.body.append(new_tag) elif contains_notation: for js in ["notation", "picturefill"]: new_tag = soup.new_tag('script', src=f'/js/{js}.js') soup.body.append(new_tag) with open(out, 'w') as f: f.write(str(soup))
from bs4 import BeautifulSoup


def get_context(path):
    with open(path, 'r') as file:
        return file.read()


if __name__ == '__main__':
    content = get_context('..\\requests\\econpy.html')
    bs = BeautifulSoup(content, 'html.parser')
    a = bs.new_tag('a', href='https://github.com/gabriel-acuna')
    a.string = 'Github profile'
    new_tag = bs.new_tag('div', title='site-data', id='i001', class_='info')
    new_tag.append('\n')
    new_tag.append(a)
    new_tag.append('\n')
    # append(): add the element at parent element end
    bs.body.append(new_tag)
    # insert()
    bs.body.insert(1, new_tag)
    print(bs.body)
async def wiki(message, fname, url="https://{lang}.wikipedia.org/w/api.php",
               query=None, lang=None, lurk=False, prefix="w", **kwargs):
    w = Wikipya(url=url, lang=lang, lurk=lurk, **kwargs)
    try:
        if query is None:
            command, query = message.text.split(maxsplit=1)
        page, image, url = await w.get_all(
            query, lurk,
            blocklist=WIKIPYA_BLOCKLIST,
            img_blocklist=kwargs.get("img_blocklist") or (),
            prefix=prefix)
        text = fixWords(page.parsed)
    except NotFound:
        await message.reply(_("errors.not_found"))
        return
    except ValueError:
        await message.reply(_("errors.enter_wiki_query").format(message.text),
                            parse_mode="Markdown")
        return
    soup = BeautifulSoup(text, "lxml")
    i = soup.find_all("i")
    b = soup.find_all("b")
    if len(i) != 0:
        i[0].unwrap()
    if len(b) != 0:
        if url is not None:
            b = b[0]
            b.name = "a"
            b["href"] = url
            b = b.wrap(soup.new_tag("b"))
    text = unbody(soup)
    try:
        if image != -1:
            cropped = cuteCrop(text, limit=1024)
            if cropped == "":
                cropped = text[:1024]
            await bot.send_chat_action(message.chat.id, "upload_photo")
            await message.reply_photo(image, caption=cropped, parse_mode="HTML")
        else:
            await message.reply(cuteCrop(text, limit=4096), parse_mode="HTML",
                                disable_web_page_preview=True)
    except Exception as e:
        await message.reply(bold(_("errors.error")) + "\n" + code(e),
                            parse_mode="HTML")
        await message.answer(cuteCrop(text, limit=4096),
                             disable_web_page_preview=True)
def Items(self, opts=None): """ 生成器,返回一个元组 对于HTML:section,url,title,content 对于图片,mime,url,filename,content """ cnt4debug = 0 decoder = AutoDecoder(False) timeout = self.timeout for section, url in self.feeds: cnt4debug += 1 if IsRunInLocal and cnt4debug > 1: break opener = URLOpener(self.host, timeout=timeout) result = opener.open(url) status_code, content = result.status_code, result.content if status_code != 200 or not content: self.log.warn('fetch article failed(%d):%s.' % (status_code,url)) continue if self.page_encoding: content = content.decode(self.page_encoding) else: content = decoder.decode(content,url) content = self.preprocess(content) soup = BeautifulSoup(content, "lxml") try: title = soup.html.head.title.string except AttributeError: self.log.warn('object soup invalid!(%s)'%url) continue title = self.processtitle(title) if self.keep_only_tags: body = soup.new_tag('body') try: if isinstance(self.keep_only_tags, dict): self.keep_only_tags = [self.keep_only_tags] for spec in self.keep_only_tags: for tag in soup.find('body').find_all(**spec): body.insert(len(body.contents), tag) soup.find('body').replace_with(body) except AttributeError: # soup has no body element pass def remove_beyond(tag, next): while tag is not None and getattr(tag, 'name', None) != 'body': after = getattr(tag, next) while after is not None: ns = getattr(tag, next) after.decompose() after = ns tag = tag.parent if self.remove_tags_after: rt = [self.remove_tags_after] if isinstance(self.remove_tags_after, dict) else self.remove_tags_after for spec in rt: tag = soup.find(**spec) remove_beyond(tag, 'next_sibling') if self.remove_tags_before: tag = soup.find(**self.remove_tags_before) remove_beyond(tag, 'previous_sibling') remove_tags = self.insta_remove_tags + self.remove_tags remove_ids = self.insta_remove_ids + self.remove_ids remove_classes = self.insta_remove_classes + self.remove_classes remove_attrs = self.insta_remove_attrs + self.remove_attrs for tag in soup.find_all(remove_tags): tag.decompose() for id in remove_ids: for tag in soup.find_all(attrs={"id":id}): tag.decompose() for cls in remove_classes: for tag in soup.find_all(attrs={"class":cls}): tag.decompose() for attr in remove_attrs: for tag in soup.find_all(attrs={attr:True}): del tag[attr] for tag in soup.find_all(attrs={"type":"text/css"}): tag.decompose() for cmt in soup.find_all(text=lambda text:isinstance(text, Comment)): cmt.extract() if self.keep_image: self.soupbeforeimage(soup) for img in soup.find_all('img',attrs={'src':True}): imgurl = img['src'] if img.get('height') in ('1','2','3','4','5') \ or img.get('width') in ('1','2','3','4','5'): self.log.warn('img size too small,take away it:%s' % imgurl) img.decompose() continue if not imgurl.startswith('http'): imgurl = urlparse.urljoin(url, imgurl) if self.fetch_img_via_ssl and url.startswith('https://'): imgurl = imgurl.replace('http://', 'https://') if self.isfiltered(imgurl): self.log.warn('img filtered:%s' % imgurl) img.decompose() continue imgresult = opener.open(imgurl) imgcontent = process_image(imgresult.content,opts) if imgresult.status_code==200 else None if imgcontent: imgtype = imghdr.what(None, imgcontent) if imgtype: imgmime = r"image/" + imgtype fnimg = "%d.%s" % (random.randint(10000,99999999), 'jpg' if imgtype=='jpeg' else imgtype) img['src'] = fnimg yield (imgmime, imgurl, fnimg, imgcontent, None) else: img.decompose() else: self.log.warn('fetch img failed(err:%d):%s' % (imgresult.status_code,imgurl)) img.decompose() else: for img in soup.find_all('img'): img.decompose() 
self.soupprocessex(soup) content = unicode(soup) #提取文章内容的前面一部分做为摘要 brief = u'' if GENERATE_TOC_DESC: body = soup.find('body') for h1 in body.find_all('h1'): # 去掉H1,避免和标题重复 h1.decompose() for s in body.stripped_strings: brief += unicode(s) + u' ' if len(brief) >= TOC_DESC_WORD_LIMIT: brief = brief[:TOC_DESC_WORD_LIMIT] break soup = None content = self.postprocess(content) yield (section, url, title, content, brief)
import os

import markdown
from bs4 import BeautifulSoup

SOURCE_FILE = os.path.join(os.path.dirname(__file__), '..', 'README.md')
DEST_PATH = os.path.join(os.path.dirname(__file__), '..', 'build')

if not os.path.exists(DEST_PATH):
    os.makedirs(DEST_PATH)

with open(SOURCE_FILE, 'r') as source:
    html = markdown.markdown(source.read())

soup = BeautifulSoup(html, 'html.parser')

# Reconstruct title
new_title = soup.new_tag('p')
new_title.string = "😱 A dark theme for JetBrains IDEs"
soup.find('h1').replace_with(new_title)

# Remove badges
blockquote_h2 = soup.find('blockquote')
blockquote_h2.find_next_sibling('p').decompose()
blockquote_h2.decompose()

# Set image widths
for img in soup.find_all('img'):
    img['width'] = '700'

# Add margin above images
for img in soup.find_all('img'):
    img.insert_before(soup.new_tag('br'))

# Remove installation
installation_h2 = soup.find('h2', text='Installation')
installation_h2.find_next('ol').decompose()
installation_h2.decompose()
def docx_to_html(fonts, jumpiness, word_rotation, width_shift, height_shift, rotace, table_header=True):
    # Convert every HTML file previously generated from the docx sources.
    for filee in os.listdir("data\\converted\\docx"):
        filee_converted = "data\\converted\\docx\\" + filee
        filee_dest = "data\\done\\" + filee

        # Read the converted HTML and strip the markup we do not want to keep.
        with open(filee_converted, "r", encoding="utf-8") as f:
            result = f.read()
        result = result.replace("<em>", "")
        result = result.replace("</em>", "")
        result = result.replace("<strong>", "")
        result = result.replace("</strong>", "")
        result = result.replace("<li>", "<p>")
        result = result.replace("</li>", "</p>")
        result = result.replace("`", " ")
        if not table_header:
            # Blunt text replacement: turns th into td and the "header" class into "even".
            result = result.replace("th", "td")
            result = result.replace("<thead>", "")
            result = result.replace("</thead>", "")
            result = result.replace("<tbody>", "")
            result = result.replace("</tbody>", "")
            result = result.replace("header", "even")

        soup = BeautifulSoup(result, "html.parser")

        # Align the text on the lined paper.
        soup.append(soup.new_tag('style', type='text/css'))
        soup.style.append(
            'body{margin-left:' + str(margin_left) + 'cm; line-height:7mm; color:#000F55; word-spacing: 0.25cm;} '
            'p{margin:0px;} '
            'td:nth-child(even) {padding-right:80px;} '
            'td:nth-child(odd) {padding-right:30px;} '
            'th {font-weight: normal;} '
            'td {padding-top: 0; padding-bottom: 0;} '
            'th:nth-child(even) {padding-right:55px;} '
            'th:nth-child(odd) {padding-right:30px;}'
        )
        # 1 inch top offset in Chrome /// line-height:7.83mm (squared paper) /// line-height:6.83mm (lined paper)

        # Style the letters inside paragraphs.
        for p in soup.find_all("p"):
            p["style"] = "margin:0px 0px {1}cm {0}px;transform:rotate({2}deg);".format(
                randrange(width_shift[0], width_shift[1]), 0, randrange(rotace[0], rotace[1]))
            # Randomize the letters.
            line = p.decode_contents()
            res = ""
            i = 0
            while i < len(line):
                if line[i:i + 1] == " ":
                    res += line[i:i + 1]
                elif unidecode(line[i:i + 1]) == unidecode(""):
                    res += " "
                elif (line[i:i + 5] == "<span" or line[i:i + 6] == "</span") or (line[i:i + 2] == "<p" or line[i:i + 3] == "</p"):
                    # Copy an existing tag through unchanged.
                    while line[i:i + 1] != ">":
                        res += line[i:i + 1]
                        i += 1
                    res += ">"
                elif line[i:i + 1] == "^":
                    # '^' is rendered in a fixed font at normal size.
                    res += "<span style='font-family:{0};top:{1}px;font-size:100%;transform:skewY({2}deg)'>{3}</span>".format(
                        "mv boli", randrange(jumpiness[0], jumpiness[1]),
                        randrange(word_rotation[0], word_rotation[1]), line[i:i + 1])
                else:
                    word = ["<span style='font-family:{0};top:{1}px;font-size:170%;transform:skewY({2}deg)'>".format(
                        choice(fonts), randrange(jumpiness[0], jumpiness[1]),
                        randrange(word_rotation[0], word_rotation[1])), "</span>"]
                    res += word[0] + line[i:i + 1] + word[1]
                i += 1
            p.string = res

        # Style the letters inside tables.
        for t in soup.find_all("table"):
            # <th> cells
            for th in t.find_all("th"):
                # Randomize the letters.
                line = th.decode_contents()
                res = ""
                i = 0
                while i < len(line):
                    if line[i:i + 1] == " ":
                        res += line[i:i + 1]
                    elif unidecode(line[i:i + 1]) == unidecode(""):
                        res += " "
                    elif line[i:i + 5] == "<span" or line[i:i + 6] == "</span":
                        while line[i:i + 1] != ">":
                            res += line[i:i + 1]
                            i += 1
                        res += ">"
                    else:
                        word = ["<span style='font-family:{0};top:{1}px;font-size:170%;transform:skewY({2}deg)'>".format(
                            choice(fonts), randrange(jumpiness[0], jumpiness[1]),
                            randrange(word_rotation[0], word_rotation[1])), "</span>"]
                        res += word[0] + line[i:i + 1] + word[1]
                    i += 1
                th.string = res
            # <td> cells
            for td in t.find_all("td"):
                # Random left offset.
                td["style"] = "padding-left:%spx;" % randrange(0, 8)
                # Randomize the letters.
                line = td.decode_contents()
                res = ""
                i = 0
                while i < len(line):
                    if line[i:i + 1] == " ":
                        res += line[i:i + 1]
                    elif unidecode(line[i:i + 1]) == unidecode(""):
                        res += " "
                    elif line[i:i + 5] == "<span" or line[i:i + 6] == "</span":
                        while line[i:i + 1] != ">":
                            res += line[i:i + 1]
                            i += 1
                        res += ">"
                    else:
                        word = ["<span style='font-family:{0};top:{1}px;font-size:170%;transform:skewY({2}deg)'>".format(
                            choice(fonts), randrange(jumpiness[0], jumpiness[1]),
                            randrange(word_rotation[0], word_rotation[1])), "</span>"]
                        res += word[0] + line[i:i + 1] + word[1]
                    i += 1
                td.string = res

        # Write the new file. Assigning .string escapes the injected markup,
        # so unescape the angle brackets before saving.
        with open(filee_dest, "w", encoding="utf-8") as f:
            soup = str(soup).replace("&lt;", "<")
            soup = soup.replace("&gt;", ">")
            f.write(str(soup))

    print("done")
def perform(self, document, sourceHTML, sourceURL, srcPrefix):
    aggregateCSS = ""
    if len(srcPrefix) and not srcPrefix.endswith('/'):
        srcPrefix = srcPrefix + '/'

    # Retrieve CSS rel links from the pasted HTML and aggregate them into one string.
    CSSRelSelector = CSSSelector("link[rel=stylesheet],link[rel=StyleSheet],link[rel=STYLESHEET]")
    matching = CSSRelSelector.evaluate(document)
    for element in matching:
        try:
            csspath = element.get("href")
            if len(sourceURL):
                if element.get("href").lower().find("http://", 0) < 0:
                    parsedUrl = urlparse.urlparse(sourceURL)
                    csspath = urlparse.urljoin(parsedUrl.scheme + "://" + parsedUrl.hostname, csspath)
            f = urllib.urlopen(csspath)
            aggregateCSS += ''.join(f.read())
            element.getparent().remove(element)
        except:
            raise IOError('The stylesheet ' + element.get("href") + ' could not be found')

    # Include inline <style> elements.
    print aggregateCSS
    CSSStyleSelector = CSSSelector("style,Style")
    matching = CSSStyleSelector.evaluate(document)
    for element in matching:
        aggregateCSS += element.text
        element.getparent().remove(element)

    # Convert the document to a style dictionary compatible with etree.
    styledict = self.getView(document, aggregateCSS)

    # Set the inline style attribute unless the element is one not worth styling.
    ignoreList = ['html', 'head', 'title', 'meta', 'link', 'script', 'repeater', 'singleline', 'multiline', 'br', 'layout']
    for element, style in styledict.items():
        if element.tag not in ignoreList:
            v = style.getCssText(separator=u'')
            element.set('style', v)

    # Convert the tree back to plain-text HTML.
    self.convertedHTML = etree.tostring(document, method="xml", pretty_print=True, encoding='UTF-8')
    self.convertedHTML = self.convertedHTML.replace('&#13;', '')  # tedious raw conversion of line breaks

    # We've inlined the CSS, now fix up the HTML src tags.
    soup = BeautifulSoup(self.convertedHTML)
    for img in soup.find_all("img"):
        img['src'] = srcPrefix + img.get('src')

    # Now set width and min-width on all our tables.
    for table in soup.find_all("table"):
        if table.get('width') is not None:
            width = table.get('width')
            if not width.endswith('%'):
                if table.get('style') is None:
                    style = []
                else:
                    style = table.get('style').split(';')
                style = [x for x in style if x]
                style.append("min-width:" + width + "px")
                style.append("width:" + width + "px")
                table['style'] = ';'.join(style)

    # Might as well go ahead and throw a style tag in the head for iOS fixes.
    if soup.html.head is None:
        soup.html.insert(0, soup.new_tag('head'))
    if soup.html.head.style is None:
        soup.html.head.append(soup.new_tag('style', type="text/css"))
    soup.html.head.style.append("""
        a[href^="x-apple-data-detectors:"] { color: #000000; text-decoration: none; }
        a[href^="tel"], a[href^="sms"], a[href^="mailto"] { color: #000000; text-decoration: none; }
    """)

    # Propagate spacer.gif dimensions to the parent cell when flagged by class.
    for img in soup.find_all('img'):
        if 'spacer.gif' in img.get('src'):
            classes = img.get('class')
            if classes is not None:
                if 'w' in classes:
                    img.parent['width'] = img.get('width')
                if 'h' in classes:
                    img.parent['height'] = img.get('height')

    self.convertedHTML = str(soup)
    return self
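# Standalone sketch of the width/min-width inlining applied to tables above,
# shown on made-up markup so the rule is easy to verify in isolation.
from bs4 import BeautifulSoup

demo = BeautifulSoup('<table width="600"><tr><td>cell</td></tr></table>', 'html.parser')
for table in demo.find_all('table'):
    width = table.get('width')
    if width and not width.endswith('%'):
        style = [x for x in (table.get('style') or '').split(';') if x]
        style.append('min-width:' + width + 'px')
        style.append('width:' + width + 'px')
        table['style'] = ';'.join(style)
print(demo)  # the table now carries style="min-width:600px;width:600px"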
def save_to_html(self, title="", file_name=""):
    """
    title: The <H1> title in the mobi file
    file_name: Save file as <file_name>.mobi
    """
    if self.is_empty():
        raise BriticleException("File is empty")

    if not title:
        title = self.title

    # Generate file name via title if it doesn't exist
    if not file_name:
        if title:
            file_name = re.sub(r'[^-\w ]+', '', title).replace(' ', '_')
        else:
            file_name = re.sub(r'[^-\w ]+', '', self.title).replace(' ', '_')
        if not file_name:
            file_name = "Untitled_Documentation"

    # Save images to local and change the <img> src to the new location
    i = 1
    soup = BeautifulSoup(self.html, 'html.parser')
    images = soup.find_all('img')
    print('images: {}'.format(images))
    for img in images:
        if 'src' not in img.attrs:
            continue
        src = img['src']
        image_ext = src.split(".")[-1]
        # Set it as PNG when the suffix does not exist
        if len(image_ext) >= 5:
            image_ext = "png"
        image_name = "%03d.%s" % (i, image_ext)
        dir_image = os.path.join(self.save_dir, file_name)
        if not os.path.exists(dir_image):
            os.mkdir(dir_image)
        local_file_name = os.path.join(dir_image, image_name)
        try:
            download_to_local(src, local_file_name)
        except URLError:
            continue
        except Exception as e:
            if 'timed out' in str(e):
                continue
            raise
        new_tag = soup.new_tag("img", src=file_name + "/" + image_name)
        img.replace_with(new_tag)
        i += 1

    html_file = os.path.join(self.save_dir, file_name + '.html')
    tags_h1 = soup.find_all('h1')
    h1_exists = True if (tags_h1 and len(tags_h1) == 1) else False
    with open(html_file, 'w') as f:
        html = u""
        if h1_exists:
            hr = soup.new_tag('hr')
            tags_h1[0].insert_after(hr)
        else:
            html = u'<h1>{}</h1>\r\n<hr/>\r\n'.format(title)
        html += '{}'.format(soup)
        # FIXME: netloc not correct for URLs ending with "xxx.com.cn"
        try:
            netloc = urlparse(self.url).netloc
            netloc = u".".join(netloc.split(".")[-2:])
        except:
            netloc = u"Original URL"
        html += u'<br/>From <a href="%s">%s</a>. ' % (self.url, netloc)
        f.write(html)

    self.html_file = html_file
    return html_file
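# Hypothetical usage of save_to_html(); how the surrounding Briticle-style
# object is constructed is an assumption and not shown in this file.
bf = Briticle("https://example.com/some-post")  # assumed constructor
path = bf.save_to_html(title="Some Post", file_name="some_post")
print("HTML written to " + path)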
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):

    def __init__(self, namespaceHTMLElements, soup=None):
        if soup:
            self.soup = soup
        else:
            from bs4 import BeautifulSoup
            self.soup = BeautifulSoup("", "html.parser")
        super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)

    def documentClass(self):
        self.soup.reset()
        return Element(self.soup, self.soup, None)

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]

        doctype = Doctype.for_name_and_ids(name, publicId, systemId)
        self.soup.object_was_parsed(doctype)

    def elementClass(self, name, namespace):
        tag = self.soup.new_tag(name, namespace)
        return Element(tag, self.soup, namespace)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        from bs4 import BeautifulSoup
        self.soup = BeautifulSoup("", "html.parser")
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup, None)

    def appendChild(self, node):
        # XXX This code is not covered by the BS4 tests.
        self.soup.append(node.element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return treebuilder_base.TreeBuilder.getFragment(self).element

    def testSerializer(self, element):
        from bs4 import BeautifulSoup
        rv = []
        doctype_re = re.compile(
            r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')

        def serializeElement(element, indent=0):
            if isinstance(element, BeautifulSoup):
                pass
            if isinstance(element, Doctype):
                m = doctype_re.match(element)
                if m:
                    name = m.group(1)
                    if m.lastindex > 1:
                        publicId = m.group(2) or ""
                        systemId = m.group(3) or m.group(4) or ""
                        rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" % (' ' * indent, name, publicId, systemId))
                    else:
                        rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
                else:
                    rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
            elif isinstance(element, Comment):
                rv.append("|%s<!-- %s -->" % (' ' * indent, element))
            elif isinstance(element, NavigableString):
                rv.append("|%s\"%s\"" % (' ' * indent, element))
            else:
                if element.namespace:
                    name = "%s %s" % (prefixes[element.namespace], element.name)
                else:
                    name = element.name
                rv.append("|%s<%s>" % (' ' * indent, name))
                if element.attrs:
                    attributes = []
                    for name, value in list(element.attrs.items()):
                        if isinstance(name, NamespacedAttribute):
                            name = "%s %s" % (prefixes[name.namespace], name.name)
                        if isinstance(value, list):
                            value = " ".join(value)
                        attributes.append((name, value))
                    for name, value in sorted(attributes):
                        rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
                indent += 2
                for child in element.children:
                    serializeElement(child, indent)

        serializeElement(element, 0)
        return "\n".join(rv)
def readability_by_soup(self, article, url, opts=None):
    content = self.preprocess(article)
    soup = BeautifulSoup(content, "lxml")

    try:
        title = soup.html.head.title.string
    except AttributeError:
        self.log.warn('object soup invalid!(%s)' % url)
        return
    title = self.processtitle(title)
    soup.html.head.title.string = title

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                keep_only_tags = [self.keep_only_tags]
            else:
                keep_only_tags = self.keep_only_tags
            for spec in keep_only_tags:
                for tag in soup.find('body').find_all(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replace_with(body)
        except AttributeError:  # soup has no body element
            pass

    for spec in self.remove_tags_after:
        tag = soup.find(**spec)
        remove_beyond(tag, 'next_sibling')

    for spec in self.remove_tags_before:
        tag = soup.find(**spec)
        remove_beyond(tag, 'previous_sibling')

    remove_tags = self.insta_remove_tags + self.remove_tags
    remove_ids = self.insta_remove_ids + self.remove_ids
    remove_classes = self.insta_remove_classes + self.remove_classes
    remove_attrs = self.insta_remove_attrs + self.remove_attrs

    for tag in soup.find_all(remove_tags):
        tag.decompose()
    for id in remove_ids:
        for tag in soup.find_all(attrs={"id": id}):
            tag.decompose()
    for cls in remove_classes:
        for tag in soup.find_all(attrs={"class": cls}):
            tag.decompose()
    for attr in remove_attrs:
        for tag in soup.find_all(attrs={attr: True}):
            del tag[attr]
    for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    if self.extra_css:
        sty = soup.new_tag('style', type="text/css")
        sty.string = self.extra_css
        soup.html.head.append(sty)

    if self.keep_image:
        opener = URLOpener(self.host, timeout=self.timeout)
        self.soupbeforeimage(soup)
        for img in soup.find_all('img', attrs={'src': True}):
            imgurl = img['src']
            if img.get('height') in ('1', '2', '3', '4', '5') \
                    or img.get('width') in ('1', '2', '3', '4', '5'):
                self.log.warn('img size too small,take away it:%s' % imgurl)
                img.decompose()
                continue
            if not imgurl.startswith('http'):
                imgurl = self.urljoin(url, imgurl)
            print url
            print imgurl
            if self.fetch_img_via_ssl and url.startswith('https://'):
                imgurl = imgurl.replace('http://', 'https://')
            if self.isfiltered(imgurl):
                self.log.warn('img filtered:%s' % imgurl)
                img.decompose()
                continue
            imgresult = opener.open(imgurl)
            imgcontent = self.process_image(imgresult.content, opts) if imgresult.code == 200 else None
            if imgcontent:
                imgtype = imghdr.what(None, imgcontent)
                if imgtype:
                    imgmime = r"image/" + imgtype
                    fnimg = "img%s_%d.%s" % (datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                                             self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                    img['src'] = fnimg
                    yield (imgmime, imgurl, fnimg, imgcontent, None)
                else:
                    img.decompose()
            else:
                self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code, imgurl))
                img.decompose()
        # Strip links that wrap images
        for img in soup.find_all('img'):
            if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                img.parent.replace_with(img)
    else:
        for img in soup.find_all('img'):
            img.decompose()

    # Add a heading if the content has no title of its own
    t = soup.html.body.find(['h1', 'h2'])
    if not t:
        t = soup.new_tag('h1')
        t.string = title
        soup.html.body.insert(0, t)
    else:
        totallen = 0
        for ps in t.previous_siblings:
            totallen += len(string_of_tag(ps))
            if totallen > 40:  # this h1/h2 appears mid-article, so it is not the article title
                t = soup.new_tag('h1')
                t.string = title
                soup.html.body.insert(0, t)
                break

    self.soupprocessex(soup)
    content = unicode(soup)

    # Use the beginning of the article as the TOC description (brief)
    brief = u''
    if GENERATE_TOC_DESC:
        body = soup.find('body')
        for h in body.find_all(['h1', 'h2']):  # drop h1/h2 so the brief does not repeat the title
            h.decompose()
        for s in body.stripped_strings:
            brief += unicode(s) + u' '
            if len(brief) >= TOC_DESC_WORD_LIMIT:
                brief = brief[:TOC_DESC_WORD_LIMIT]
                break
    soup = None

    yield (title, None, None, content, brief)
if bibtex['ENTRYTYPE'] == "article":
    formattedTags.append("[[Journal Article]]")
elif bibtex['ENTRYTYPE'] == "book":
    formattedTags.append("Book")
elif bibtex['ENTRYTYPE'] == "inproceedings":
    formattedTags.append("[[Conference Paper]]")
elif bibtex['ENTRYTYPE'] == "phdthesis":
    formattedTags.append("Dissertation")
elif bibtex['ENTRYTYPE'] == "mastersthesis":
    formattedTags.append("Thesis")
elif bibtex['ENTRYTYPE'] == "techreport":
    formattedTags.append("[[Technical Report]]")
elif bibtex['ENTRYTYPE'] == "manual":
    formattedTags.append("[[Technical Manual]]")

tag = soup.new_tag('div')
authors = bibtex['author'].split('and')
for i in range(0, len(authors)):
    authorSplit = authors[i].split(',')
    author = ""
    if len(authorSplit) > 1:
        author = authorSplit[1].strip() + " " + authorSplit[0].strip()
    else:
        author = authorSplit[0].lstrip()
    authorTag = 'author' + str(i + 1)
    tag.attrs[authorTag] = author
    if len(authorSplit) == 1:
        formattedTags.append(author)
    else:
        formattedTags.append("[[" + author + "]]")
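# Standalone sketch of the author-splitting convention used above:
# "Last, First and Last2, First2" becomes "First Last" / "First2 Last2".
# The sample record is made up for illustration.
bibtex = {'author': 'Doe, Jane and Smith, John'}
authors = []
for raw in bibtex['author'].split('and'):
    parts = raw.split(',')
    if len(parts) > 1:
        authors.append(parts[1].strip() + ' ' + parts[0].strip())
    else:
        authors.append(parts[0].strip())
print(authors)  # ['Jane Doe', 'John Smith']
# Note: a plain split('and') also splits inside names such as "Anderson";
# the original code shares that behaviour.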
def readability(self, article, url, opts=None):
    """Process the full article with readability-lxml."""
    content = self.preprocess(article)

    # Extract the main content
    try:
        doc = readability.Document(content)
        summary = doc.summary(html_partial=True)
    except:
        self.log.warn('article is invalid.[%s]' % url)
        return

    title = doc.short_title()
    title = self.processtitle(title)

    soup = BeautifulSoup(summary, 'lxml')
    # soup = BeautifulSoup(content, 'lxml')
    '''
    # no <head>: create one with the title
    h = soup.find('head')
    if not h:
        h = soup.new_tag('head')
        t = soup.new_tag('title')
        t.string = title
        h.append(t)
        soup.html.insert(0, h)
    # no h1/h2: insert the title as an <h1>
    t = soup.html.body.find(['h1', 'h2'])
    if not t:
        t = soup.new_tag('h1')
        t.string = title
        soup.html.body.insert(0, t)
    else:
        totallen = 0
        for ps in t.previous_siblings:
            totallen += len(string_of_tag(ps))
            if totallen > 40:
                t = soup.new_tag('h1')
                t.string = title
                soup.html.body.insert(0, t)
                break
    '''
    self.soupbeforeimage(soup)

    if self.remove_tags:
        for tag in soup.find_all(self.remove_tags):
            tag.decompose()
    for id in self.remove_ids:
        for tag in soup.find_all(attrs={"id": id}):
            tag.decompose()
    for cls in self.remove_classes:
        for tag in soup.find_all(attrs={"class": cls}):
            tag.decompose()
    for attr in self.remove_attrs:
        for tag in soup.find_all(attrs={attr: True}):
            del tag[attr]
    for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    if self.extra_css:
        sty = soup.new_tag('style', type="text/css")
        sty.string = self.extra_css
        soup.html.head.append(sty)

    if self.keep_image:
        opener = URLOpener(self.host, timeout=self.timeout)
        for img in soup.find_all('img', attrs={'src': True}):
            imgurl = img['src']
            if img.get('height') in ('1', '2', '3', '4', '5') \
                    or img.get('width') in ('1', '2', '3', '4', '5'):
                self.log.warn('img size too small,take it away : %s' % imgurl)
                img.decompose()
                continue
            if not imgurl.startswith('http'):
                imgurl = self.urljoin(url, imgurl)
            if self.fetch_img_via_ssl and url.startswith('https://'):
                imgurl = imgurl.replace('http://', 'https://')
            if self.isfiltered(imgurl):
                self.log.warn('img filtered : %s' % imgurl)
                img.decompose()
                continue
            imgresult = opener.open(imgurl)
            imgcontent = self.process_image(imgresult.content, opts) if imgresult.code == 200 else None
            if imgcontent:
                imgtype = imghdr.what(None, imgcontent)
                if imgtype:
                    imgmime = r"image/" + imgtype
                    fnimg = "img%s_%d.%s" % (datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                                             self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                    img['src'] = fnimg
                    yield (imgmime, imgurl, fnimg, imgcontent, None)
                else:
                    img.decompose()
            else:
                self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code, imgurl))
                img.decompose()
        # Strip links that wrap images
        for img in soup.find_all('img'):
            if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                img.parent.replace_with(img)
    else:
        for img in soup.find_all('img'):
            img.decompose()

    self.soupprocessex(soup)

    # readability wraps the extracted fragment in a single container element
    cc = soup.body.contents[0]
    # cc.name = "articleblock"
    # content = unicode(soup)
    content = unicode(cc)

    # Use the beginning of the article as the TOC description (brief)
    brief = u''
    if GENERATE_TOC_DESC:
        body = soup.find('body')
        for h in body.find_all(['h1', 'h2']):  # drop h1/h2 so the brief does not repeat the title
            h.decompose()
        for s in body.stripped_strings:
            brief += unicode(s) + u' '
            if len(brief) >= TOC_DESC_WORD_LIMIT:
                brief = brief[:TOC_DESC_WORD_LIMIT]
                break
    soup = None

    yield (title, None, None, content, brief)
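# Minimal standalone sketch of the readability-lxml calls relied on above;
# the sample HTML is made up for illustration.
from readability import Document

sample = ("<html><head><title>Demo page - Site</title></head><body>"
          "<div id='main'><h1>Demo page</h1>" + "<p>Body text.</p>" * 50 + "</div>"
          "<div id='sidebar'><p>nav</p></div></body></html>")
doc = Document(sample)
print(doc.short_title())                   # cleaned-up title, e.g. 'Demo page'
fragment = doc.summary(html_partial=True)  # main content as an HTML fragment, without the <html>/<body> wrapper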
def Items(self, opts=None):
    decoder = AutoDecoder(False)
    timeout = self.timeout
    for section, url in self.feeds:
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        code, content = result.code, result.content
        if code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (code, url))
            continue

        if self.page_encoding:
            try:
                content = content.decode(self.page_encoding)
            except UnicodeDecodeError:
                content = decoder.decode(content, opener.realurl)
        else:
            content = decoder.decode(content, opener.realurl)
        content = self.preprocess(content)

        soup = BeautifulSoup(content, "lxml")

        h = soup.find('head')
        if not h:
            h = soup.new_tag('head')
            t = soup.new_tag('title')
            t.string = section
            h.append(t)
            soup.html.insert(0, h)

        try:
            title = soup.html.head.title.string
        except AttributeError:
            title = section
        title = self.processtitle(title)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    keep_only_tags = [self.keep_only_tags]
                else:
                    keep_only_tags = self.keep_only_tags
                for spec in keep_only_tags:
                    for tag in soup.find('body').find_all(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replace_with(body)
            except AttributeError:  # soup has no body element
                pass

        for spec in self.remove_tags_after:
            tag = soup.find(**spec)
            remove_beyond(tag, 'next_sibling')

        for spec in self.remove_tags_before:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previous_sibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs

        for tag in soup.find_all(remove_tags):
            tag.decompose()
        for id in remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            self.soupbeforeimage(soup)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1', '2', '3', '4', '5') \
                        or img.get('width') in ('1', '2', '3', '4', '5'):
                    self.log.warn('img size too small,take away it:%s' % imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%s_%d.%s" % (datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
                                                 self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code, imgurl))
                    img.decompose()
            # Strip links that wrap images
            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        content = unicode(soup)

        # Use the beginning of the article as the TOC description (brief)
        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # drop h1/h2 so the brief does not repeat the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None

        content = self.postprocess(content)
        yield (section, url, title, content, brief)