def genReport(url, new_scan, date): print "[+] Generating report..." report = BeautifulSoup(open("./templates/template.html", 'r').read()) ### In future do this for each page in list filepath = getFilePath(url, False).split("/") report.oldscan.string = date report.newscan.string = time.strftime("%y-%m-%d") report.target.string = filepath[2] report.targetpage.string = filepath[-1] #Add results diff = report.diff diff.string = "" date, changes = analyzePage(url, new_scan, date) diff.append(report.new_string(changes[0])) for line in changes[1:]: diff.append(report.new_tag('br')) diff.append(report.new_string(line)) #Write changes to compiled report filename = "./reports/" + "/".join(filepath[2:4]) + "/report.html" open(filename, "w+").write(str(report)) print "[+] Saved final report in: " + filename
def _select_info(self, html, rules):
    """Extract items from a page with a CSS selector and yield them.

    :param html: page markup to extract from
    :param rules: CSS selector string
    :return: generator yielding item1, item2 ... itemN
    """
    if html and rules:
        soup = BeautifulSoup(html, self.parser)
        item = soup.select(rules)
        # Append extra URL links to the result list.
        # When the rule is the one that fetches category links, pull the
        # hrefs out and extend the list with the user-configured URLs.
        if ADDITIONAL_URL and rules == 'ul.sub-menu > li > ul > li > a':
            item = [url.get('href') for url in item]
            item.extend(ADDITIONAL_URL)
        # temp holds the book download links found on the page
        temp = get_down_load_link(soup, down_link_rules)
        if temp:
            dttag = soup.new_tag("dt")
            # add a <dd> tag as a sibling of the <dt>
            ddtag = soup.new_tag("dd")
            # set the label text
            new_string = soup.new_string("downloadlink: ")
            dttag.append(new_string)
            # append the book's download link itself
            new_string = soup.new_string(temp[0].get('href'))
            ddtag.append(new_string)
            item[0].append(dttag)
            item[0].append(ddtag)
        for match in item:
            # yield makes this a generator, iterated with a for loop
            yield match
def get_ip():
    """Resolve the client's IP location/ISP via ipip.net and render it.

    Returns the page template (module-level ``html``) with the IP, region
    and ISP text appended to the body, serialized as a string.
    """
    # X-Forwarded-For may carry a proxy chain; the first entry is the client.
    ip = request.headers['X-Forwarded-For'].replace(' ','').split(',')[0]
    response = requests.get("https://www.ipip.net/ip/"+ip+".html",headers=ua).content
    _soup = BeautifulSoup(response, 'html.parser')
    # NOTE(review): `html` is not defined in this function — presumably a
    # module-level template string; confirm before relying on it.
    soup = BeautifulSoup(html, 'html.parser')
    # (duplicate initializations removed — these were assigned twice)
    region_info = ''
    isp = ''
    # Scan the result table for the location / carrier rows.
    tds = _soup.find_all('td')
    for td in tds:
        if td.string == "地理位置":
            region_info = td.find_next_sibling('td').span.string
        if td.string == "运营商":
            isp = td.find_next_sibling('td').span.string
    # Lay out: ip <br> region <br> isp
    soup.body.append(soup.new_tag('br'))
    soup.body.append(soup.new_tag('br'))
    soup.find_all("br")[0].insert_before(soup.new_string(ip))
    soup.find_all("br")[0].insert_after(soup.new_string(region_info))
    soup.find_all("br")[1].insert_after(soup.new_string(isp))
    return str(soup)
def generate_html(self, url, status_url, last_checked_time):
    """Insert or update a (url, status url, last-checked) row in AppView.html.

    The file is read, modified in memory, and written back in place.

    :param url: monitored URL; used as the row key
    :param status_url: status endpoint shown in the second column
    :param last_checked_time: timestamp string shown in the third column
    """
    with open("AppView.html") as inf:
        txt = inf.read()
        soup = BeautifulSoup(txt, "html.parser")
        #print(soup.prettify())

    # Build the new <tr> with one <td> per value.
    new_tr = soup.new_tag('tr')
    new_td_url = soup.new_tag('td')
    new_td_url.append(soup.new_string(url))
    new_td_status_url = soup.new_tag('td')
    new_td_status_url.append(soup.new_string(status_url))
    new_td_last_checked_time = soup.new_tag('td')
    new_td_last_checked_time.append(soup.new_string(last_checked_time))
    # insert it into the document
    new_tr.append(new_td_url)
    new_tr.append(new_td_status_url)
    new_tr.append(new_td_last_checked_time)

    old_tr = soup.findChildren('tr')
    for tr in old_tr:
        old_td = tr.findChildren('td')
        # NOTE(review): assumes every <tr> has at least one <td>; a
        # header-only row (<th> cells) would raise IndexError — confirm
        # against the template.
        url_string = old_td[0].getText()
        if url_string != '':
            if url_string == url:
                # NOTE(review): this always replaces the table's FIRST row
                # (soup.table.tr), not the matching row — verify intent.
                soup.table.tr.replaceWith(new_tr)
            else:
                soup.table.append(new_tr)
        else:
            soup.table.tr.replaceWith(new_tr)

    # save the file again
    with open("AppView.html", "w") as outf:
        outf.write(str(soup))
def get_xml_from_dict(params_dict):
    """
    Serialize a dict into an XML string.
    :param params_dict: dict of tag name -> int, string, or one-level nested dict
    :return: xml_str
    :rtype: str
    """
    soup = BeautifulSoup(features="xml")
    xml = soup.new_tag("xml")

    def _leaf(name, value):
        # Single tag: ints become plain text, strings become CDATA sections.
        node = soup.new_tag(name)
        if isinstance(value, int):
            node.append(soup.new_string(str(value)))
        elif isinstance(value, (str, unicode)):
            node.append(CData(value))
        return node

    for key, value in params_dict.items():
        if isinstance(value, (int, str, unicode)):
            node = _leaf(key, value)
        else:
            # Nested dict: one child tag per entry.
            node = soup.new_tag(key)
            for sub_key, sub_value in value.items():
                node.append(_leaf(sub_key, sub_value))
        xml.append(node)
    return str(xml)
def process_references_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup, refs: Dict) -> Dict:
    """
    Process all references in paragraph and generate a dict that contains (type, ref_id, surface_form)
    :param para_el:
    :param sp:
    :param refs:
    :return:
    """
    token_gen = UniqTokenGenerator('REFTOKEN')
    reference_entries = dict()
    for ref_el in para_el.find_all('ref'):
        try:
            ref_type = ref_el.get('type')
            # bibliography citations are handled elsewhere
            if ref_type == 'bibr':
                continue
            if ref_type in ('table', 'figure'):
                target = ref_el.get('target')
                # keep the normalized id only when it resolves to a known ref
                if target and normalize_grobid_id(target) in refs:
                    normalized = normalize_grobid_id(target)
                else:
                    normalized = None
                # register under a fresh token and swap the token in
                token = token_gen.next()
                reference_entries[token] = (normalized, ref_el.text.strip(), ref_type)
                ref_el.replace_with(sp.new_string(f" {token} "))
            else:
                # any other type: keep only the surface form
                ref_el.replace_with(sp.new_string(ref_el.text.strip()))
        except AttributeError:
            continue
    return reference_entries
def write_index_page(self):
    """
    Using an HTML Template, create an index page that lists all of the
    created calendars.
    """
    LOG.info('Writing index page...')

    # Copy needed files to the destination
    for filename in self.ICONS:
        source = os.path.join("resources", filename)
        if os.path.exists(source):
            shutil.copy(source, self.meta.output)

    with open(self.TEMPLATE, "r") as f:
        template = f.read()
    parser = BeautifulSoup(template, 'lxml')

    # Locate insertion points
    title = parser.find('title')
    header = parser.find('div', {'id': 'header'})
    footer = parser.find('div', {'id': 'footer'})

    # Page title
    title.insert(0, parser.new_string("WBC %s Event Schedule" % self.meta.year))
    header.h1.insert(0, parser.new_string("WBC %s Event Schedule" % self.meta.year))
    footer.p.insert(0, parser.new_string("Updated on %s" % self.meta.now.strftime("%A, %d %B %Y %H:%M %Z")))

    # Tournament event calendars
    tourneys = dict([(k, v) for k, v in self.calendars.items() if k not in self.meta.special])
    # NOTE(review): `cmp` exists only on Python 2; this module is py2-only.
    ordering = lambda x, y: cmp(tourneys[x]['summary'], tourneys[y]['summary'])
    self.render_calendar_table(parser, 'tournaments', 'Tournament Events', tourneys, ordering)

    # Non-tourney event calendars
    nontourneys = dict([(k, v) for k, v in self.calendars.items() if k in self.meta.special])
    self.render_calendar_list(parser, 'other', 'Other Events', nontourneys)

    # Location calendars
    self.render_calendar_list(parser, 'location', 'Location Calendars', self.locations)

    # Daily calendars
    self.render_calendar_list(parser, 'daily', 'Daily Calendars', self.dailies)

    # Special event calendars
    specials = {
        'all-in-one': self.everything,
        'tournaments': self.tournaments,
    }
    self.render_calendar_list(parser, 'special', 'Special Calendars', specials)

    with codecs.open(os.path.join(self.meta.output, 'index.html'), 'w', 'utf-8') as f:
        f.write(parser.prettify())
def MarkOne(fn, note, link):
    """Insert a notice box into the HTML document `fn` in place.

    :param fn: path of the HTML file to mark (rewritten on success)
    :param note: notice text shown before the link
    :param link: href pointing at the latest version of the document
    """
    soup = None
    with open(fn, "r") as fd:
        soup = BeautifulSoup(fd)

    # Check if the document is already marked.
    if len(soup.select("div#" + DIV_ID)) > 0:
        logging.warning("Document '%s' is already marked.", fn)
        return

    # Build the box containing the note.
    note_div = soup.new_tag("div")
    note_div["id"] = DIV_ID
    note_div["style"] = DIV_STYLE
    note_header = soup.new_tag("p")
    note_header["style"] = HEADER_STYLE
    note_header.string = "Note:"
    note_div.append(note_header)
    note_main = soup.new_tag("p")
    note_main["style"] = NOTE_STYLE
    note_div.append(note_main)
    note_text = soup.new_string(note + " ")
    note_main.append(note_text)
    note_link = soup.new_tag('a')
    note_link["href"] = link
    note_link.string = "Goto the latest version"
    note_main.append(note_link)
    note_final_dot = soup.new_string(".")
    note_main.append(note_final_dot)

    # Selector should lead to a single node; insert after the first
    # selector that matches exactly one node.
    done = False
    for selector in SELECTORS:
        node_lst = soup.select(selector)
        if not done and len(node_lst) == 1:
            node_lst[0].insert_after(note_div)
            done = True
    if not done:
        # Fallback: prepend to <body>, or fail if there is none.
        if soup.body:
            soup.body.insert(0, note_div)
        else:
            raise Exception(
                "Unable to find a place to insert note in '%s'." % fn)

    with codecs.open(fn, "w", "utf-8") as fd:
        fd.write(unicode(soup))
def MarkOne(fn, note, link):
    """Insert a notice box into the HTML document `fn`, unless already marked."""
    with open(fn, "r") as fd:
        soup = BeautifulSoup(fd)

    # Already-marked documents are left untouched.
    if len(soup.select("div#" + DIV_ID)) > 0:
        logging.warning("Document '%s' is already marked.", fn)
        return

    # Assemble the notice: a styled <div> holding a header paragraph and a
    # body paragraph that links to the latest version.
    banner = soup.new_tag("div")
    banner["id"] = DIV_ID
    banner["style"] = DIV_STYLE

    header_p = soup.new_tag("p")
    header_p["style"] = HEADER_STYLE
    header_p.string = "Note:"
    banner.append(header_p)

    body_p = soup.new_tag("p")
    body_p["style"] = NOTE_STYLE
    banner.append(body_p)
    body_p.append(soup.new_string(note + " "))

    latest_link = soup.new_tag("a")
    latest_link["href"] = link
    latest_link.string = "Goto the latest version"
    body_p.append(latest_link)
    body_p.append(soup.new_string("."))

    # Try each selector in turn; insert only after the first one that
    # matches exactly one node.
    inserted = False
    for selector in SELECTORS:
        matches = soup.select(selector)
        if not inserted and len(matches) == 1:
            matches[0].insert_after(banner)
            inserted = True
    if not inserted:
        if soup.body:
            soup.body.insert(0, banner)
        else:
            raise Exception("Unable to find a place to insert note in '%s'." % fn)

    with codecs.open(fn, "w", "utf-8") as fd:
        fd.write(unicode(soup))
def markup_gloss_abbrs(soup, string):
    """Yield soup nodes for a '.'-separated gloss string, wrapping known
    abbreviations in tooltip spans."""
    for index, abbr in enumerate(string.split('.')):
        # re-emit the separator that split() consumed
        if index:
            yield soup.new_string('.')
        atom = abbr.strip().upper()
        person_match = re.match('(1|2|3)(?P<atom>SG|PL)', atom)
        if person_match:
            # 1SG/2PL etc.: the hint key is the number-less part
            atom = person_match.group('atom')
        if person_match or atom in ABBRS:
            hint = soup.new_tag('span', **{'class': 'hint--bottom', 'data-hint': ABBRS[atom]})
            hint.string = abbr
            yield hint
        else:
            yield soup.new_string(abbr)
def replace_ref_tokens(sp: BeautifulSoup, el: bs4.element.Tag, ref_map: Dict):
    """
    Replace all references in element with special tokens
    :param sp:
    :param el:
    :param ref_map:
    :return:
    """
    # citations become BIBREF tokens
    for cite in el.find_all('cit'):
        try:
            bib_key = cite.ref.get('target').replace('bid', 'BIBREF')
            cite.replace_with(sp.new_string(f" {bib_key} "))
        except AttributeError:
            print('Attribute error: ', cite)
            continue

    # everything that is not a citation
    for rtag in el.find_all('ref'):
        try:
            raw = rtag.get('target')
            if raw and not raw.startswith('bid'):
                if raw.startswith('cid'):
                    target = raw.replace('cid', 'SECREF')
                elif raw.startswith('uid'):
                    # uid may map to several token families; use the first
                    # candidate present in ref_map, else just upper-case it.
                    for token_kind in ('FIGREF', 'TABREF', 'EQREF', 'FOOTREF', 'SECREFU'):
                        candidate = raw.replace('uid', token_kind)
                        if candidate in ref_map:
                            target = candidate
                            break
                    else:
                        target = raw.upper()
                else:
                    print('Weird ID!')
                    target = raw.upper()
                rtag.replace_with(sp.new_string(f" {target} "))
        except AttributeError:
            print('Attribute error: ', rtag)
            continue
    return el
def postprocess(self, content):
    """Fetch the linked xueqiu discussion page and append its 'good
    comments' to the article HTML.

    :param content: article HTML (unicode)
    :return: content, possibly with comment paragraphs appended
    """
    # Find the "this topic has N discussions on xueqiu, click to view" link.
    pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>', re.I)
    comment = ''
    mt = pn.search(content)
    url = mt.group(1) if mt else None
    if url:
        opener = URLOpener(url, timeout=self.timeout)
        result = opener.open(url)
        if result.status_code == 200 and result.content:
            if self.feed_encoding:
                try:
                    comment = result.content.decode(self.feed_encoding)
                except UnicodeDecodeError:
                    return content
            # The comments are embedded as a JS assignment; pull the JSON out.
            pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I)
            mt = pn.search(comment)
            if mt:
                comment_json = mt.group(1)
                j = json.loads(comment_json)
                soup = BeautifulSoup(content, "lxml")
                for c in j['comments']:
                    u = c['user']['screen_name']
                    # Render "@user:text", downgrading images to their alt text.
                    t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text']))
                    for img in t.find_all('img', alt=True):
                        img.replace_with(t.new_string(img['alt']))
                    soup.html.body.append(t.p)
                content = unicode(soup)
    return content
def compile_text_to_html(text_str, imgs):
    """
    Compile text file to html. All text will be included in one <p> tag,
    and any images will be appended to the end of the <body> tag. All
    newlines will be replaced with <br /> tags.
    """
    soup = BeautifulSoup()
    soup.append(soup.new_tag('html'))
    body = soup.new_tag('body')
    soup.html.append(body)
    paragraph = soup.new_tag('p')
    body.append(paragraph)
    # Interleave a <br/> before every line except the first.
    for line_no, line in enumerate(text_str.split('\n')):
        if line_no:
            paragraph.append(soup.new_tag('br'))
        paragraph.append(soup.new_string(line))
    # Images are referenced by their MIME content id.
    for img in imgs:
        body.append(soup.new_tag('img', src='cid:%s' % img["tag"], style="max-width: 100%"))
    return str(soup)
async def handle_content(self, content):
    """Rewrite page <p> text so roughly every 5th word becomes a dork link.

    :param content: raw HTML of the page being served
    :return: the (possibly modified) page encoded as UTF-8 bytes
    """
    soup = BeautifulSoup(content, 'html.parser')
    if self.no_dorks is not True:
        for p_elem in soup.find_all('p'):
            # Only rewrite leaf paragraphs; skip ones with child elements.
            if p_elem.findChildren():
                continue
            # Preserve the paragraph's inline CSS, if any.
            css = None
            if 'style' in p_elem.attrs:
                css = cssutils.parseStyle(p_elem.attrs['style'])
            text_list = p_elem.text.split()
            p_new = soup.new_tag('p', style=css.cssText if css else None)
            for idx, word in enumerate(text_list):
                # Fetch dorks if required
                if len(self.dorks) <= 0:
                    self.dorks = await self.get_dorks()
                word += ' '
                if idx % 5 == 0:
                    # Link styled to be indistinguishable from plain text.
                    a_tag = soup.new_tag(
                        'a',
                        href=self.dorks.pop(),
                        style=
                        'color:{color};text-decoration:none;cursor:text;'.
                        format(color=css.color if css and 'color' in
                               css.keys() else '#000000'))
                    a_tag.string = word
                    p_new.append(a_tag)
                else:
                    p_new.append(soup.new_string(word))
            p_elem.replace_with(p_new)
    # NOTE(review): this re-serializes the soup even when dorks are
    # disabled, which may change whitespace/entities — confirm intended.
    content = soup.encode('utf-8')
    return content
def postprocess(self, content): pn = re.compile(ur'<a href="(\S*?)">本话题在雪球有.*?条讨论,点击查看。</a>', re.I) mt = pn.search(content) url = mt.group(1) if mt else None if url: opener = URLOpener(url, timeout=self.timeout) result = opener.open(url) if result.status_code == 200 and result.content: if self.feed_encoding: try: comment = result.content.decode(self.feed_encoding) except UnicodeDecodeError: return content pn = re.compile(r'SNB.data.goodComments\ =\ ({.*?});', re.S | re.I) mt = pn.search(comment) comment_json = mt.group(1) if mt else None j = json.loads(comment_json) soup = BeautifulSoup(content, "lxml") for c in j['comments']: u = c['user']['screen_name'] t = BeautifulSoup('<p>@%s:%s</p>' % (u, c['text'])) for img in t.find_all('img', alt=True): img.replace_with(t.new_string(img['alt'])) soup.html.body.append(t.p) content = unicode(soup) return content
async def pathfinder(self, ctx, *, spell):
    """ Retrieves information about a spell in pathfinder. """
    # Parameterized LIKE lookup against the module-level spells DB.
    spell_data = conn.execute("SELECT * FROM spells WHERE name LIKE ?", (spell,)).fetchone()
    if spell_data:
        # Flatten the stored HTML description to plain text, keeping
        # paragraph breaks by appending blank lines after each <p>.
        desc_soup = BeautifulSoup(spell_data["description_formated"], "html.parser")
        for p in desc_soup.findAll("p"):
            p.insert_after(desc_soup.new_string("\n\n"))
        desc = desc_soup.get_text()
        output = (
            "**{name}**\n\n"
            "**School** {school}; **Level** {spell_level}\n\n"
            "**Casting Time** {casting_time}\n**Components** {components}\n\n"
            "**Range** {range}\n**Target** {targets}\n**Duration** {duration}\n"
            "**Saving Throw** {saving_throw}; **Spell Resistance** {spell_resistence}\n\n".format(**spell_data)
        )
        await ctx.send(output + desc)
    else:
        await ctx.send("I didn't find anything!")
def test_insert_before_something_empty(self):
    """insert_before must be rejected by empty strings, the soup object,
    and an element targeting itself."""
    soup = BeautifulSoup("")
    anchor = soup.new_tag("a")
    empty = soup.new_string("")
    self.assertRaises(ValueError, empty.insert_before, anchor)
    self.assertRaises(NotImplementedError, soup.insert_before, anchor)
    self.assertRaises(ValueError, anchor.insert_before, anchor)
def test_insert_after_something_that_has_no_meaning(self):
    """insert_after must be rejected by empty strings, the soup object,
    and an element targeting itself."""
    soup = BeautifulSoup("")
    anchor = soup.new_tag("a")
    empty = soup.new_string("")
    self.assertRaises(ValueError, empty.insert_after, anchor)
    self.assertRaises(NotImplementedError, soup.insert_after, anchor)
    self.assertRaises(ValueError, anchor.insert_after, anchor)
def handle_html_content(self, content):
    """Rewrite page <p> text so every 5th word becomes a dork link.

    :param content: raw HTML of the page being served
    :return: the modified page encoded as UTF-8 bytes
    """
    soup = BeautifulSoup(content, 'html.parser')
    for p_elem in soup.find_all('p'):
        # Preserve the paragraph's inline CSS, if any.
        css = None
        if 'style' in p_elem.attrs:
            css = cssutils.parseStyle(p_elem.attrs['style'])
        text_list = p_elem.text.split()
        p_new = soup.new_tag('p', style=css.cssText if css else None)
        for idx, word in enumerate(text_list):
            # Refill the dork queue lazily when it runs out.
            if len(self.dorks) <= 0:
                self.dorks = yield from self.get_dorks()
            word += ' '
            if idx % 5 == 0:
                # Link styled to inherit text color with no underline, so
                # it is visually indistinguishable from plain text.
                a_tag = soup.new_tag(
                    'a',
                    href=self.dorks.pop(),
                    style='color:{color};text-decoration:none;cursor:text;'.format(
                        color=css.color if css and 'color' in css.keys() else '#000000'
                    )
                )
                a_tag.string = word
                p_new.append(a_tag)
            else:
                p_new.append(soup.new_string(word))
        p_elem.replace_with(p_new)
    content = soup.encode('utf-8')
    return content
def extract_formulas_from_tei_xml(sp: BeautifulSoup) -> None:
    """
    Replace every <formula> element with its stripped text content.
    :param sp: parsed TEI XML soup, modified in place
    :return:
    """
    for formula in sp.find_all('formula'):
        stripped_text = formula.text.strip()
        formula.replace_with(sp.new_string(stripped_text))
def parse_date_html(html_string): """Takes a string that contains html, and returns (date, date_string, content) as a tuple. For now, date is an int that represents the year. Negative numbers are B.C. and positive are A.D. years. If there is no date that can be parsed, returns None. """ # preprocess to add newlines after <br />, or else get_text smushes things # together soup = BeautifulSoup(html_string) for el in soup.descendants: if el.name == 'br': el.insert_after(soup.new_string('\n')) el.insert_before(soup.new_string('\n')) html_splitter = HtmlSplitter(unicode(soup)) s = html_splitter.text_string content_offset = 0 # strip out all non-letter/digit characters from the beginning m = re.search('^[^\d\w]+', s) if m: content_offset += m.end() if not s: return None # get the date extract = parse_date_text(s[content_offset:]) if not extract: return None (date, date_index) = extract date_string = html_splitter.get_span(content_offset, date_index + content_offset) content_offset += date_index # strip out any transition characters between the date and the content m = re.search(u'^[\s\-–—:\.]+', s[content_offset:]) if m: content_offset += m.end() content = '' if content_offset >= len(s) \ else html_splitter.get_span(content_offset, len(s)) return (date, date_string, content)
def markup_gloss_abbrs(soup, string):
    """Yield markup nodes for a '.'-separated gloss abbreviation string,
    emitting tooltip spans for recognized abbreviations."""
    for position, chunk in enumerate(string.split('.')):
        # re-emit the '.' separator consumed by split()
        if position > 0:
            yield soup.new_string('.')
        normalized = chunk.strip().upper()
        person = re.match('(1|2|3)(?P<atom>SG|PL)', normalized)
        if normalized in ABBRS or person:
            if person:
                # 1SG/2PL etc.: the hint key is the number-less part
                normalized = person.group('atom')
            hint_span = soup.new_tag(
                'span', **{
                    'class': 'hint--bottom',
                    'data-hint': ABBRS[normalized]
                })
            hint_span.string = chunk
            yield hint_span
        else:
            yield soup.new_string(chunk)
def urlize_html(html, trim_url_limit=40):
    """will urlize html, while ignoring link patterns
    inside anchors, <pre> and <code> tags
    """
    soup = BeautifulSoup(html, 'html5lib')
    extract_nodes = list()
    for node in soup.findAll(text=True):
        # skip text that is already inside a link/image/code context
        parent_tags = [p.name for p in node.parents]
        skip_tags = ['a', 'img', 'pre', 'code']
        if set(parent_tags) & set(skip_tags):
            continue
        #bs4 is weird, so we work around to replace nodes
        #maybe there is a better way though
        urlized_text = urlize(node, trim_url_limit=trim_url_limit)
        if unicode(node) == urlized_text:
            continue  # nothing was linkified; leave this node untouched
        sub_soup = BeautifulSoup(urlized_text, 'html5lib')
        contents = sub_soup.find('body').contents
        num_items = len(contents)
        for i in range(num_items):
            #there is strange thing in bs4, can't iterate
            #as the tag seemingly can't belong to >1 soup object
            # (insert_before detaches the child from sub_soup, so
            # `contents` shrinks each pass — hence contents[0] in a
            # counted loop)
            child = contents[0] #always take first element
            #insure that text nodes are sandwiched by space
            have_string = (not hasattr(child, 'name'))
            if have_string:
                node.insert_before(soup.new_string(' '))
            node.insert_before(child)
            if have_string:
                node.insert_before(soup.new_string(' '))
        extract_nodes.append(node)
    #extract the nodes that we replaced
    for node in extract_nodes:
        node.extract()
    result = unicode(soup.find('body').renderContents(), 'utf8')
    # preserve a trailing newline if the input had one
    if html.endswith('\n') and not result.endswith('\n'):
        result += '\n'
    return result
def urlize_html(html, trim_url_limit=40):
    """will urlize html, while ignoring link patterns
    inside anchors, <pre> and <code> tags
    """
    soup = BeautifulSoup(html, 'html5lib')
    extract_nodes = list()
    for node in soup.findAll(text=True):
        # text already inside a link/image/code context must not be touched
        parent_tags = [p.name for p in node.parents]
        skip_tags = ['a', 'img', 'pre', 'code']
        if set(parent_tags) & set(skip_tags):
            continue
        # bs4 is weird, so we work around to replace nodes
        # maybe there is a better way though
        urlized_text = urlize(node, trim_url_limit=trim_url_limit)
        if unicode(node) == urlized_text:
            continue  # urlize made no change; keep the original node
        sub_soup = BeautifulSoup(urlized_text, 'html5lib')
        contents = sub_soup.find('body').contents
        num_items = len(contents)
        for i in range(num_items):
            # there is strange thing in bs4, can't iterate
            # as the tag seemingly can't belong to >1 soup object
            # (moving a child into `soup` removes it from `contents`,
            # so we repeatedly take index 0 in a counted loop)
            child = contents[0]  # always take first element
            # insure that text nodes are sandwiched by space
            have_string = (not hasattr(child, 'name'))
            if have_string:
                node.insert_before(soup.new_string(' '))
            node.insert_before(child)
            if have_string:
                node.insert_before(soup.new_string(' '))
        extract_nodes.append(node)
    # extract the nodes that we replaced
    for node in extract_nodes:
        node.extract()
    result = unicode(soup.find('body').renderContents(), 'utf8')
    # keep a trailing newline when the input had one
    if html.endswith('\n') and not result.endswith('\n'):
        result += '\n'
    return result
def format_spaces(text):
    """Strip literal spaces from text nodes, turn SPACE_TAG markers back
    into spaces, and drop nodes that end up empty. Returns the soup and
    the flattened body text."""
    soup = BeautifulSoup(text)
    for original in list(soup.strings):
        cleaned_value = original.replace(' ', '').replace(SPACE_TAG, ' ')
        replacement = soup.new_string(cleaned_value)
        original.replace_with(replacement)
        if replacement == '':
            replacement.extract()
    return soup, tag_to_text(soup.body).replace(' '+SPACE_TAG+' ', ' ')
def generate_html():
    """
    Generate the dictionary as html.

    Reads `dict_file` (CSV of english, japanese, romaji columns), renders a
    striped bootstrap table, and writes it into template.html as
    html/index.html.
    """
    soup = BeautifulSoup()
    table = soup.new_tag('table')
    table['class'] = 'table table-striped'
    with open(dict_file) as f:
        # `reader` was previously named `dict`, shadowing the builtin.
        reader = csv.reader(f)
        # Header row: one (label, bootstrap column class) pair per column
        # (was three copy-pasted blocks).
        tr = soup.new_tag('tr')
        for label, column_class in (
                ('英語 / English', 'col-xs-2'),
                ('日本語 / Japanese', 'col-xs-2'),
                ('ローマ字 / Rōmaji', 'col-xs-1')):
            th = soup.new_tag('th')
            th.append(soup.new_string(label))
            th['class'] = column_class
            tr.append(th)
        table.append(tr)
        # One table row per CSV record.
        for words in reader:
            tr = soup.new_tag('tr')
            for word in words:
                td = soup.new_tag('td')
                td.append(soup.new_string(word))
                tr.append(td)
            table.append(tr)
    soup.append(table)
    with open('template.html') as f:
        template = f.read()
    with open('html/index.html', 'w') as f:
        html = template.format(table=soup.prettify(), size=path.getsize(dict_file) // 1000)
        f.write(html)
def urlize_html(html):
    """will urlize html, while ignoring link patterns
    inside anchors, <pre> and <code> tags
    """
    soup = BeautifulSoup(html, "html5lib")
    extract_nodes = list()
    for node in soup.findAll(text=True):
        # skip text nodes already inside a link/image/code context
        parent_tags = [p.name for p in node.parents]
        skip_tags = ["a", "img", "pre", "code"]
        if set(parent_tags) & set(skip_tags):
            continue
        # bs4 is weird, so we work around to replace nodes
        # maybe there is a better way though
        urlized_text = urlize(node)
        if unicode(node) == urlized_text:
            continue  # urlize made no change; keep the original node
        sub_soup = BeautifulSoup(urlized_text, "html5lib")
        contents = sub_soup.find("body").contents
        num_items = len(contents)
        for i in range(num_items):
            # there is strange thing in bs4, can't iterate
            # as the tag seemingly can't belong to >1 soup object
            # (insert_before detaches the child from sub_soup, shrinking
            # `contents` — hence contents[0] inside a counted loop)
            child = contents[0]  # always take first element
            # insure that text nodes are sandwiched by space
            have_string = not hasattr(child, "name")
            if have_string:
                node.insert_before(soup.new_string(" "))
            node.insert_before(child)
            if have_string:
                node.insert_before(soup.new_string(" "))
        extract_nodes.append(node)
    # extract the nodes that we replaced
    for node in extract_nodes:
        node.extract()
    result = unicode(soup.find("body").renderContents(), "utf8")
    # preserve a trailing newline if the input had one
    if html.endswith("\n") and not result.endswith("\n"):
        result += "\n"
    return result
def typograf(html):
    """Remove hanging prepositions (glue them to the following word with a
    non-breaking space, then render back to plain spaces)."""
    soup = Soup(html, 'html5lib')
    for text_node in soup.findAll(text=True):
        # only touch nodes the nbsp pattern actually matches
        if re_nbsp.search(text_node):
            replacement = soup.new_string(unescape(_typograf_replace(text_node)))
            text_node.replace_with(replacement)
    return soup.body.decode_contents().replace('\xa0', ' ')
def get_xml(base_xxx, db_package):
    """Build the Criteo XML feed for one opt-in base.

    :param base_xxx: abbreviation of the opt-in list to look up
    :param db_package: DB connection settings forwarded to the pool
    :return: prettified XML string
    """
    #psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    #psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY)
    initiate_threaded_connection_pool(db_package)
    with getconnection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT id, nom FROM optin_list WHERE abreviation = %s", (str(base_xxx), ))
        records = cursor.fetchone()
        if records:
            optin_id = records[0]
            nom = records[1]
        else:
            optin_id = '0'
            nom = ""
        # Header/footer snippets. fetchone() may return None, which
        # previously crashed on the immediate [0] subscript.
        cursor.execute(
            "SELECT xml FROM criteo_xml WHERE optin_id = %s AND usage = %s",
            (str(optin_id), 'header'))
        row = cursor.fetchone()
        header = row[0] if row and row[0] else ""
        cursor.execute(
            "SELECT xml FROM criteo_xml WHERE optin_id = %s AND usage = %s",
            (str(optin_id), 'footer'))
        row = cursor.fetchone()
        footer = row[0] if row and row[0] else ""
    conn_pool.closeall()

    post_dict = {}
    post_dict['id'] = '1'
    post_dict['nom'] = nom
    post_dict['header'] = header
    post_dict['footer'] = footer

    xml_doc = BeautifulSoup(features='xml')
    xml_doc.append(xml_doc.new_tag("bases"))
    xml_doc.bases.append(xml_doc.new_tag("base"))
    cpt_content = 0
    # NOTE(review): dict iteration order is arbitrary on Python 2, so the
    # child-tag order in the feed is not guaranteed — confirm consumers
    # don't depend on it.
    for key, value in post_dict.iteritems():
        xml_doc.bases.base.append(xml_doc.new_tag(str(key)))
        xml_container = xml_doc.bases.base.contents[cpt_content]
        if key == 'footer':
            # wrap raw HTML footers so the XML stays well-formed
            xml_formatted_value = "<![CDATA[" + value + "]]>"
        else:
            xml_formatted_value = value
        xml_container.append(xml_doc.new_string(xml_formatted_value))
        cpt_content += 1
    xml_feed = xml_doc.prettify()
    # prettify() entity-escapes the CDATA markers; un-escape them.
    # NOTE(review): the previous code had no-op replaces ("<" -> "<"),
    # apparently mangled from the entity forms restored here — verify.
    xml_feed = xml_feed.replace("&lt;", "<").replace("&gt;", ">")  #.replace("<p>", "").replace("</p>", "")
    return xml_feed
def _add_title_tag(soup: BeautifulSoup) -> Tag:
    # Builds an ODF node for the closed-over `title` (from the enclosing
    # scope — not visible here, confirm): level 0 becomes a paragraph with
    # the 'Title' style, any other level a heading with that outline level.
    if title.level == 0:
        new_tag = soup.new_tag(
            ODFXMLTagNames.TEXT_P.value,
            attrs={ODFXMLAttributes.STYLE_NAME.value: 'Title'})
    else:
        new_tag = soup.new_tag(
            ODFXMLTagNames.TEXT_H.value,
            attrs={ODFXMLAttributes.TITLE_LEVEL.value: str(title.level)})
    new_tag.append(soup.new_string(title.text))
    return new_tag
def _add_tag(soup: BeautifulSoup) -> Tag:
    # Builds an ODF table cell for the closed-over `cell` (from the
    # enclosing scope — not visible here, confirm), carrying its row/col
    # spans and wrapping the cell text in a paragraph node.
    new_tag = soup.new_tag(
        ODFXMLTagNames.TABLE_CELL.value,
        attrs={
            ODFXMLAttributes.TABLE_ROW_SPAN.value: str(cell.rowspan),
            ODFXMLAttributes.TABLE_COL_SPAN.value: str(cell.colspan),
        },
    )
    p_tag = soup.new_tag(ODFXMLTagNames.TEXT_P.value)
    p_tag.append(soup.new_string(cell.content.text))
    new_tag.append(p_tag)
    return new_tag
def use_bs4():
    # Exploratory/demo function: shows creating a Comment via new_string,
    # unwrapping the root element, and inspecting tag help().
    # NOTE(review): `xml` is not defined here — presumably a module-level
    # sample document; confirm.
    soup = BeautifulSoup(xml, 'lxml')
    from bs4 import Comment
    # passing a NavigableString subclass makes new_string build that type
    x = soup.new_string('xxx', Comment)
    soup.a.append(x)
    print soup
    #print soup.a.clear(True)
    soup.root.unwrap()
    print '-' * 10
    print soup
    #help(soup)
    help(soup.a)
def handle_request(self, request, payload):
    """Serve a static file from the cloned-site directory — injecting
    dork-style links into HTML <p> text — and report the request to the
    local event API."""
    # Forward request metadata to the logging/event service.
    header = {key: value for (key, value) in request.headers.items()}
    data = dict(
        method=request.method,
        path=request.path,
        headers=header
    )
    r = yield from aiohttp.post('http://localhost:8090/event', data=json.dumps(data))
    ret = yield from r.text()
    print(ret)
    response = aiohttp.Response(
        self.writer, 200, http_version=request.version
    )
    # Resolve the requested path inside the page directory.
    base_path = '/'.join(['/opt/snare/pages', self.run_args.page_dir])
    parsed_url = urlparse(unquote(request.path))
    path = '/'.join(
        [base_path, parsed_url.path[1:]]
    )
    path = os.path.normpath(path)
    # normpath + prefix check keeps traversal attempts inside base_path.
    if os.path.isfile(path) and path.startswith(base_path):
        with open(path, 'rb') as fh:
            content = fh.read()
        content_type = mimetypes.guess_type(path)[0]
        if content_type:
            if 'text/html' in content_type:
                print(content_type)
                soup = BeautifulSoup(content, 'html.parser')
                # Rebuild every <p>: every 5th word becomes a link styled
                # to look like plain text.
                for p_elem in soup.find_all('p'):
                    text_list = p_elem.text.split()
                    p_new = soup.new_tag('p', style='color:#000000')
                    for idx, word in enumerate(text_list):
                        word += ' '
                        if idx % 5 == 0:
                            a_tag = soup.new_tag(
                                'a',
                                href='http://foo.com',
                                style='color:#000000;text-decoration:none;cursor:text;'
                            )
                            a_tag.string = word
                            p_new.append(a_tag)
                        else:
                            p_new.append(soup.new_string(word))
                    p_elem.replace_with(p_new)
                content = str(soup).encode('utf-8')
                # print(repr(content))
        # NOTE(review): headers are added even when content_type is None;
        # confirm aiohttp tolerates a None Content-Type here.
        response.add_header('Content-Type', content_type)
        response.add_header('Content-Length', str(len(content)))
        response.send_headers()
        response.write(content)
    else:
        response.status = 404
        response.send_headers()
    yield from response.write_eof()
def output_journal(self, journal: Journal):
    """Render a Journal as an HTML index page: title block, locations,
    optional cover image, and a table of contents with one entry (and one
    generated page) per ToC item."""
    soup = BeautifulSoup('', 'html5lib')
    body = soup.find('body')
    # Add title and distance statement
    self.output_title(soup, body, journal.journal_title)
    self.output_subtitle(soup, body, journal.journal_subtitle)
    self.output_subtitle(soup, body, 'By {0}'.format(journal.journal_author))
    locations = [location for location in journal.locales]
    self.output_para(soup, body, 'Locations: {0}'.format(', '.join(locations)))
    if journal.cover_image:
        self.output_picture(soup, body, journal.cover_image)
    toc_div = soup.new_tag('div', attrs={'class': 'toc_container'})
    # Iterate over the ToC and process every page
    page_idx = 1
    for toc_item in journal.toc:
        p_tag = soup.new_tag('p')
        toc_tag = p_tag
        if toc_item.page:
            # Real page: emit it to its own file and link to it from the ToC.
            html_filename = self.output_page(toc_item.page, page_idx)
            toc_tag = soup.new_tag('a', attrs={'href': html_filename})
            p_tag.append(toc_tag)
            toc_tag.append(soup.new_string(toc_item.page.title))
            page_idx += 1
        elif toc_item.subtitle:
            # Section subtitle: plain (unlinked) ToC entry.
            toc_tag.append(soup.new_string(toc_item.subtitle))
        toc_div.append(p_tag)
    body.append(toc_div)
    self.output_html(soup, 'index')
def put_to_html(articles):
    """Render scraped articles into the output HTML using the template.

    :param articles: iterable of dicts with 'title', 'images' and 'magnet' keys
    :return: False when template/output paths are not configured, otherwise
             None after writing the output file.
    """
    if not config['template_input_html'] or not config['output_html_path']:
        return False
    path_input = config['template_input_html']
    path_output = config['output_html_path']
    # Load the input HTML template and open the output file; `with`
    # guarantees both handles are closed (they were previously leaked).
    with open(path_input, 'r', encoding='UTF-8') as fin, \
            open(path_output, 'w+', encoding='UTF-8') as fout:
        soup = BeautifulSoup(fin.read(), 'lxml')
        body = soup.find('body')
        # Build one article <div> per article.
        for article in articles:
            # article title node
            title_node = soup.new_tag('h2')
            title_node.append(soup.new_string(article['title']))
            # BUG FIX: new_tag's second positional argument is an XML
            # namespace, not the attribute dict — pass attrs= explicitly.
            # image container node
            div_image_node = soup.new_tag('div', attrs={'class': 'image'})
            # magnet-link container node
            div_magnet_node = soup.new_tag('div', attrs={'class': 'magnet'})
            div_node = soup.new_tag('div', attrs={'class': 'article'})
            for image in article['images']:
                div_image_node.append(soup.new_tag('img', src=image))
            for magnet in article['magnet']:
                magnet_node = soup.new_tag('p')
                magnet_node.append(soup.new_string(magnet))
                div_magnet_node.append(magnet_node)
            # Attach the pieces to the page body in order.
            div_node.append(title_node)
            div_node.append(div_magnet_node)
            div_node.append(div_image_node)
            body.append(div_node)
        # Pretty-print the final page to disk.
        fout.write(soup.prettify())
def cut_bloc2bs_elt(cut_bloc_res):
    """Recursively convert a parsed block tuple into a BeautifulSoup node.

    :param cut_bloc_res: 2-tuple; ('str', text) for a text node, or
        ({'name': tag_name, ...attributes}, [child_blocks]) for an element
    :return: a NavigableString or Tag (built in a throwaway soup)
    """
    bs = BeautifulSoup('')
    if cut_bloc_res[0] == 'str':
        return bs.new_string(cut_bloc_res[1])
    name = cut_bloc_res[0]['name']
    new_bs_elt = bs.new_tag(name)
    new_bs_elt.attrs = {}
    # copy every key except the tag name into the element's attributes
    for k, v in cut_bloc_res[0].iteritems():
        if k == 'name':
            continue
        new_bs_elt.attrs[k] = v
    # recurse into the child blocks
    for bloc in cut_bloc_res[1]:
        new_bs_elt.append(cut_bloc2bs_elt(bloc))
    return new_bs_elt
def format_spaces(content):
    """Strip literal spaces from every text node, convert SPACE_TAG markers
    back into single spaces, and drop nodes left empty. Returns the soup
    and the flattened body text."""
    # Accept either raw markup or an already-parsed soup.
    soup = content if isinstance(content, BeautifulSoup) else BeautifulSoup(content, "lxml")
    for node in list(soup.strings):
        replacement = soup.new_string(node.replace(' ', '').replace(SPACE_TAG, ' '))
        node.replace_with(replacement)
        if replacement == '':
            replacement.extract()
    return soup, tag_to_text(soup.body)
def process_footnotes_from_text(sp: BeautifulSoup) -> Dict:
    """
    Process footnote marks
    :param sp:
    :return: mapping FOOTREF id -> {"num", "text", "ref_id"}; each <note>
        in the soup is replaced by its FOOTREF token.
    """
    footnote_map = dict()
    for note in sp.find_all('note'):
        try:
            if note.name and note.get('id'):
                # normalize footnote id
                ref_id = note.get('id').replace('uid', 'FOOTREF')
                # remove equation tex
                for eq in note.find_all('texmath'):
                    eq.decompose()
                # replace all xrefs with link
                for xref in note.find_all('xref'):
                    xref.replace_with(sp.new_string(f" {xref.get('url')} "))
                # clean footnote text (collapse whitespace runs to one space)
                footnote_text = None
                if note.text:
                    footnote_text = note.text.strip()
                    footnote_text = re.sub(r'\s+', ' ', footnote_text)
                    footnote_text = re.sub(r'\s', ' ', footnote_text)
                # form footnote entry
                footnote_map[ref_id] = {
                    "num": note.get('id-text', None),
                    "text": footnote_text,
                    "ref_id": ref_id
                }
                note.replace_with(sp.new_string(f" {ref_id} "))
        except AttributeError:
            continue
    return footnote_map
def _wrap_content(title, content_node, wrap):
    """Optionally wrap *content_node* in a minimal HTML document.

    When *wrap* is true, return a new document whose <title> text is
    *title* and whose <body> contains *content_node*; otherwise return
    *content_node* unchanged.
    """
    if not wrap:
        return content_node
    # BUGFIX: the template used to read "<title><title/>", which parses as
    # a stray empty tag nested inside the title; use a proper closing tag.
    doc = BeautifulSoup(
        '''
        <html>
          <head>
            <title></title>
          </head>
          <body></body>
        </html>''')
    doc.title.append(doc.new_string(title))
    doc.body.append(content_node)
    return doc
def process_formulas_in_paragraph(para_el: BeautifulSoup, sp: BeautifulSoup) -> None:
    """
    Flatten every <formula> in *para_el* into plain text, keeping its label.
    :param para_el: paragraph element to rewrite in place
    :param sp: soup used as the node factory
    :return:
    """
    for formula in para_el.find_all('formula'):
        # Pull the label out first (separated by one space) so it is not
        # duplicated inside the formula's own text.
        suffix = ''
        if formula.label:
            suffix = ' ' + formula.label.text
            formula.label.decompose()
        formula.replace_with(sp.new_string(f'{formula.text.strip()}{suffix}'))
def _extract_tags(
    soup: BeautifulSoup,
    tag_finder: Callable[[BeautifulSoup], List[Tag]]
) -> Tuple[BeautifulSoup, Dict[str, Tag]]:
    """Pull the tags selected by *tag_finder* out of a copy of *soup*,
    leaving a unique reference string (wrapped in a <w:r> run) in each
    tag's place.

    Returns the modified copy plus a reference -> extracted-Tag mapping.
    """
    working_copy = _copy_soup(soup)
    mapping: Dict[str, Tag] = {}
    for found in tag_finder(working_copy):
        reference = _generate_reference()
        placeholder = working_copy.new_string(reference)
        removed = found.replace_with(placeholder)
        if not removed:
            raise ValueError('Expecting Tag, not None.')
        mapping[reference] = removed
        # Keep the placeholder inside a run element so the Word-style
        # markup stays well-formed.
        placeholder.wrap(working_copy.new_tag('w:r'))
    return working_copy, mapping
def writexml(xml):
    """Rebuild student.xml with a <root><students> skeleton and a header
    comment describing the record layout.

    NOTE(review): the *xml* parameter was immediately shadowed in the
    original code and never used; the signature is kept for compatibility.
    """
    # BUGFIX: the file used to be opened for writing *before* being read,
    # which truncated it to zero bytes.  Read first, write afterwards.
    with open('student.xml') as src:
        doc = BeautifulSoup(src, 'xml', from_encoding='utf-8')
    root = doc.new_tag('root')
    doc.append(root)
    student = doc.new_tag('students')
    doc.root.append(student)
    # Header comment: record layout is "id": [name, math, chinese, english].
    comment = doc.new_string('\n学生信息表\n"id":[名字,数学,语文,英文]\n', Comment)
    student.append(comment)
    # BUGFIX: the original then appended the whole document into its own
    # <students> element, corrupting the tree; that call is removed.
    with open('student.xml', 'w') as out:
        out.write(doc.prettify())
def __call__(self, outdir):
    """
    runs a parser workflow consisting of
    - preprocess
    - refactor
    - postprocess
    writes the results, an html, a css and a json file to disk.
    """
    # Silence cssutils' noisy validation warnings.
    cssutils_logger = logging.getLogger('CSSUTILS')
    cssutils_logger.setLevel(logging.ERROR)
    print(self.fname.namebase.encode('utf8'))
    with open(self.fname, encoding='utf8') as fp:
        c = fp.read()
    soup = BeautifulSoup(self.preprocess(self._preprocess(c)))
    # extract css from the head section of the HTML doc:
    css = cssutils.parseString('\n')
    for style in soup.find('head').find_all('style'):
        for rule in self.cssrules(style):
            css.add(rule)
    # md accumulates metadata (outline, refs, authors) filled in by refactor.
    md = dict(outline=[], refs=[], authors=[])
    soup = self.refactor(soup, md)
    # enhance section headings: give each h3 a slug-based id and append
    # a "go to top" arrow plus a permalink anchor.
    for section, t in tag_and_text(soup.find_all('h3')):
        # Drop trailing "[Note ...]" annotations from the heading text.
        t = t.split('[Note')[0]
        id_ = 'section-%s' % slug(t)
        md['outline'].append((t, id_))
        section.attrs['id'] = id_
        for s, attrs in [
            (u'\u21eb', {'href': '#top', 'title': 'go to top of the page', 'style': 'vertical-align: bottom'}),
            ('¶', {'class': 'headerlink', 'href': '#' + id_, 'title': 'Permalink to this section'}),
        ]:
            append(section, soup.new_string('\n'), new_tag(soup, 'a', s, **attrs))
    # Resolve cross-references inside the body markup (Python 2 `unicode`).
    body = self.insert_links(unicode(soup.find('body')), md)
    # write output files:
    with open(outdir.joinpath('%s.html' % self.id), 'w', encoding='utf8') as fp:
        fp.write(self.wrap(self.postprocess(body)))
    with open(outdir.joinpath('%s.css' % self.id), 'wb') as fp:
        fp.write(self.csstext(css))
    md['authors'] = list(self.yield_valid_authors(md['authors']))
    jsondump(md, outdir.joinpath('%s.json' % self.id), indent=4)
def ExportAlgoInfo(fileName, algorithm):
    """Write the algorithm's About() HTML, styled and normalized, into
    ./svgs/infos/, naming the output after *fileName* with "svg" -> "html".
    """
    if not os.path.exists('./svgs/infos'):
        os.makedirs('./svgs/infos')
    info = algorithm.About()
    soup = BeautifulSoup(info)
    # Link the CSS file, creating a <head> if the snippet lacked one.
    head = soup.find('head')
    if not head:
        html = soup.find('html')
        html.insert(0, soup.new_tag('head'))
        head = soup.find('head')
    head.insert(
        0,
        soup.new_tag('link', rel='stylesheet',
                     href='../css/info_page_style.css'))
    # Insert a div with background-color found by colordef into the <dt> elements
    for dt in soup.find_all('dt'):
        colordef = dt.find('colordef')
        if not colordef:
            continue
        color = colordef['color']
        colordef.extract()
        color_div = soup.new_tag('div')
        color_div.string = ' '
        color_div['style'] = 'background-color: %s' % color
        color_div['class'] = 'color_div'
        dt.append(color_div)
    # Add a div with "clear: both" to make the next row
    for dd in soup.find_all('dd'):
        clear_div = soup.new_tag('div')
        clear_div['class'] = 'clear_div'
        dd.insert_after(clear_div)
    body = soup.find('body')
    if not body.contents:
        # If there isn't any content add "No algorithm info"
        body.append(soup.new_string('No algorithm info'))
    # BUGFIX: the output handle was never closed (and shadowed the `file`
    # builtin); a with-block flushes and closes it deterministically.
    out_path = "./svgs/infos/%s" % os.path.basename(fileName).replace("svg", "html")
    with open(out_path, "w") as out_file:
        out_file.write(soup.prettify(formatter=None))
def ExportAlgoInfo(fileName, algorithm):
    """Export algorithm.About() as a styled HTML info page under ./svgs/infos/.

    The output file name mirrors *fileName* with its "svg" extension
    replaced by "html".
    """
    if not os.path.exists('./svgs/infos'):
        os.makedirs('./svgs/infos')
    info = algorithm.About()
    soup = BeautifulSoup(info)
    # Ensure a <head> exists, then link the shared stylesheet into it.
    head = soup.find('head')
    if not head:
        html = soup.find('html')
        html.insert(0, soup.new_tag('head'))
        head = soup.find('head')
    head.insert(0, soup.new_tag('link', rel='stylesheet',
                                href='../css/info_page_style.css'))
    # Turn each <colordef> marker inside a <dt> into a colored swatch div.
    for dt in soup.find_all('dt'):
        colordef = dt.find('colordef')
        if not colordef:
            continue
        color = colordef['color']
        colordef.extract()
        color_div = soup.new_tag('div')
        color_div.string = ' '
        color_div['style'] = 'background-color: %s' % color
        color_div['class'] = 'color_div'
        dt.append(color_div)
    # After each <dd>, add a "clear: both" div so the next row starts fresh.
    for dd in soup.find_all('dd'):
        clear_div = soup.new_tag('div')
        clear_div['class'] = 'clear_div'
        dd.insert_after(clear_div)
    body = soup.find('body')
    if not body.contents:
        # Empty page: show a placeholder message instead of a blank body.
        body.append(soup.new_string('No algorithm info'))
    # BUGFIX: the handle was previously opened at function entry, never
    # closed, and bound to the name `file` (shadowing the builtin).
    out_path = "./svgs/infos/%s" % os.path.basename(fileName).replace("svg", "html")
    with open(out_path, "w") as out_file:
        out_file.write(soup.prettify(formatter=None))
def get_pinyin_bs(file_name):
    """
    Return file + Ruby characters.

    *file_name* name of the file to run on
    """
    # open file and parse with bs4, with xml rules
    debug(file_name)
    with open(file_name, 'r') as f:
        s1 = f.read()
    bs = BeautifulSoup(s1, 'xml')
    # Go through all the 'text' tags and collect each tag's strings
    # (as {'tag': element, 'str': text} records) into all_of_the_tags.
    all_of_the_tags = []
    for tn in GET_TAG_NAMES:
        for t in bs.findAll(tn):
            # BUGFIX: `"href" in t` tests the tag's *contents*, not its
            # attributes; has_attr() is the correct membership test.
            if t.has_attr("href"):
                t['href'] = htmlLib.escape(t['href'])
            for ts in get_all_tags_text(t):
                all_of_the_tags.append(ts)
    for p in all_of_the_tags:
        debug(p)
        p_el = p['tag']
        p_text = p['str']
        debug(p_text)
        new_p_str = ''
        debug('WORKING ON: ')
        debug("#######{0}#######".format(p_text))
        # pos_tagging would also give the type of each word, but plain
        # segmentation is enough here.
        words = [word for word in jieba.cut(p_text)]
        debug("WORDS: {0}".format(words))
        new_p_str = generate_new_html_for_words(words)
        debug("Parent Element: " + str(p_el.parent))
        debug("{0} will be replaced by {1}".format(p_el, new_p_str))
        p_el.replace_with(bs.new_string(new_p_str))
    return str(add_js_link(bs).decode(formatter=None))
base = os.path.basename(input_file) inputdir = path + input_file with open (inputdir, "r") as FA: FA_string = FA.read().replace('\n', '') #script = "<script type='text/javascript' src='http://library.albany.edu/angelfish.js'></script><script type='text/javascript'>agf.pageview();</script>" #FA_output = FA_string[:554] + script + FA_string[554:] input_string = FA_string.replace(u'\xa0', u' ') soup = Soup(input_string) title = soup.find('title') script1 = soup.new_tag('script') script1['type'] = "text/javascript" script1['src'] = "http://library.albany.edu/angelfish.js" if title is None: print base title.insert_after(script1) script2 = soup.new_tag('script') script2['type'] = "text/javascript" new_string = soup.new_string("agf.pageview();") script2.append(new_string) title.insert_after(script2) #prettyHTML=soup.prettify() output = str(soup) output_path = outputdir + base file = open(output_path, "w") file.write(output)
print(new_li_tag.prettify()) #adding/modifying string new_div_name_tag.string = "phytoplankton" print(producer_entries.prettify()) #using append new_div_name_tag.append("producer") print(soup.prettify()) #using new_string new_string_toappend = soup.new_string("producer") new_div_name_tag.append(new_string_toappend) #using insert new_string_toinsert = soup.new_string("10000") new_div_number_tag.insert(0, new_string_toinsert) print(soup.prettify()) #deleting using decompose third_producer = soup.find_all("li")[2] div_name = third_producer.div div_name.decompose() print(third_producer.prettify())
# Build a fresh index.html (one <p> per page, linking the page and its
# "_ru" translation) one directory above base_dir.
base_last = os.path.split(base_dir)[-1]
index_name = os.path.join(base_dir + '/../', 'index.html')
if os.path.exists(index_name):
    os.remove(index_name)
files = glob.glob(os.path.join(base_dir, '*.html'))
html_doc = """
<html><head><title>Index</title></head>
<body>
</body>
</html>
"""
soup = BeautifulSoup(html_doc)
for f in files:
    fname = os.path.basename(f)
    tag = soup.new_tag('p')
    soup.body.append(tag)
    # Link to the page itself, titled via get_title().
    a = soup.new_tag('a', href='./' + base_last + '/' + fname, target='_blank')
    a.append(soup.new_string(get_title(f)))
    tag.append(a)
    tag.append(soup.new_tag('br'))
    # Link to the Russian variant (same file name with "_ru" appended).
    a = soup.new_tag('a', href='./' + base_last + '/' + fname + '_ru', target='_blank')
    a.append(soup.new_string(get_title(f + '_ru')))
    tag.append(a)
    tag.append(soup.new_tag('br'))
    tag.append(soup.new_tag('br'))
# BUGFIX: the handle was never closed, so the index could stay unflushed;
# a with-block closes it deterministically.
with open(index_name, 'w+') as index:
    index.write(soup.prettify(formatter="html").encode('utf-8'))
#!/usr/bin/env python3 # -*- coding: utf-8 -*- from bs4 import BeautifulSoup from bs4 import Comment from minelibs import * import pprint from html_content import * import re soup = BeautifulSoup("<b>stop</b>", 'html.parser') tag = soup.new_tag("i") tag.string = "Don't" soup.b.string.insert_before(tag) print_eval('soup.b') print_eval('soup.b.contents') print(xgreen('- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -')) soup.b.i.insert_after(soup.new_string(" ever ")) print_eval('soup.b') print_eval('soup.b.contents')
# Rewrite the paragraph following each MsoNormalTable into a two-column
# footer table: a label cell plus an <input> cell whose id is derived
# from the label's pinyin initials.
for line in soup.find_all(class_='MsoNormalTable'):
    #print(line.next_sibling.next_sibling)
    p = line.next_sibling.next_sibling
    new_tag_tr = soup.new_tag('tr')
    new_tag_tr['class'] = 'norTr'
    new_tag = soup.new_tag('table')
    new_tag['class'] = 'footTb'
    new_tag.append(new_tag_tr)
    for sub in p.stripped_strings:
        for s in sub.splitlines():
            # Skip one-character fragments (separators, stray punctuation).
            if len(s.strip()) > 1:
                #print(s+'#')
                new_tag_td = soup.new_tag('td', style="width:1cm;")
                # Longer labels (after stripping the full-width colon)
                # get a wider cell.
                if len(s.strip(':')) > 2:
                    new_tag_td['style'] = "width:15mm;"
                new_tag_td_ip = soup.new_tag('td', style=new_tag_td['style'])
                pinyin = Pinyin()
                # Strip both the full-width and the ASCII colon variants.
                s = s.strip(':').strip(':').strip()
                pys = pinyin.get_init(s)
                # Input id: "ft_" + pinyin initials of the label text.
                new_tag_ip = soup.new_tag('input', id= 'ft_'+pys)
                new_tag_td_ip.append(new_tag_ip)
                new_tag_td.append(soup.new_string(s+':'))
                new_tag_tr.append(new_tag_td)
                new_tag_tr.append(new_tag_td_ip)
    # Swap the original paragraph for the generated footer table.
    p.replace_with(new_tag)
# Flatten remaining <p> wrappers, keeping their children.
for line in soup.find_all('p'):
    line.unwrap()
print(soup.prettify())
f = open('prase2.htm','w',encoding='utf-8')
f.write(soup.prettify())
class ConfluencePageInflater(object):
    """Convert ordinary HTML into Confluence storage-format markup.

    page_handle and attach_handle are caller-supplied resolvers: given a
    local href (and, for attachments, a title) they return a dict
    describing the target page ('title') or attachment ('resource_name'),
    or a falsy value when the target is unknown.
    """

    def __init__(self, page_source, page_handle, attach_handle,
                 encoding='utf-8'):
        super(ConfluencePageInflater, self).__init__()
        self.soup = BeautifulSoup(page_source, 'html5lib',
                                  from_encoding=encoding)
        self.page_handle = page_handle
        self.attach_handle = attach_handle
        # Guards cleaned_src so the filters run at most once.
        self.cleaned_up = False

    def filter_image(self):
        """Rewrite <img> tags as <ac:image> with the proper resource child."""
        for img in self.soup.find_all('img'):
            ac_image = self.soup.new_tag('ac:image')
            src = img.get('src')
            if src and '//' not in src:
                # Local image: must resolve to an uploaded attachment.
                attach = self.attach_handle(src, img.get('title'))
                if attach:
                    ri_resource = self.soup.new_tag('ri:attachment')
                    ri_resource['ri:filename'] = attach['resource_name']
                else:
                    # Unresolvable local image: drop it entirely.
                    img.decompose()
                    continue
            else:
                # External (or src-less) image: reference by URL.
                ri_resource = self.soup.new_tag('ri:url')
                ri_resource['ri:value'] = src
            ac_image.append(ri_resource)
            if img.has_attr('alt'):
                ac_image['ac:alt'] = img['alt']
            img.replace_with(ac_image)

    def filter_link(self):
        """Rewrite local <a> links as <ac:link>; external links are kept."""
        for link in self.soup.find_all('a'):
            href = link.get('href')
            if href and '//' not in href:
                # Strip the query string; it has no Confluence equivalent.
                if '?' in href:
                    href = href[:href.index('?')]
                ac_link = self.soup.new_tag('ac:link')
                # Fragment becomes the link's anchor attribute.
                if '#' in href:
                    ac_link['ac:anchor'] = href[href.index('#') + 1:]
                    href = href[:href.index('#')]
                if href.endswith('.html'):
                    # Link to another converted page.
                    page = self.page_handle(href)
                    if page:
                        ri_resource = self.soup.new_tag('ri:page')
                        ri_resource['ri:content-title'] = page['title']
                    else:
                        link.decompose()
                        continue
                else:
                    # Link to an attachment.
                    attach = self.attach_handle(href, link.get('title'))
                    if attach:
                        ri_resource = self.soup.new_tag('ri:attachment')
                        ri_resource['ri:filename'] = attach['resource_name']
                    else:
                        link.decompose()
                        continue
                ac_link.append(ri_resource)
                # Link body: nested tags, else plain text (as CDATA),
                # else nothing worth keeping.
                children = link.find_all()
                if children:
                    body = self.soup.new_tag('ac:link-body')
                    for child in children:
                        body.append(child)
                elif link.text:
                    body = self.soup.new_tag('ac:plain-text-link-body')
                    body.append(self.soup.new_string(link.text, CData))
                else:
                    link.decompose()
                    continue
                if link.has_attr('title'):
                    ac_link['ac:title'] = link['title']
                ac_link.append(body)
                link.replaceWith(ac_link)

    @property
    def title(self):
        # Page title as encoded bytes, or '' when there is no <title>.
        title = self.soup.find('title')
        return title and title.encode_contents().strip() or ''

    def filter_dl(self):
        """Rewrite <dl> definition lists as <ul>, pairing each dt with a dd."""
        for dl in self.soup.find_all('dl'):
            ul = self.soup.new_tag('ul')
            dts = dl.find_all('dt')
            dds = dl.find_all('dd')
            for dt, dd in zip(dts, dds):
                li = self.soup.new_tag('li')
                dt.name = 'p'
                li.append(dt)
                dd.name = 'p'
                li.append(dd)
                ul.append(li)
            dl.replace_with(ul)

    @property
    def is_home_page(self):
        # True when the page declares <meta name="homepage" value="true">.
        meta = self.soup.find('meta', attrs={'name': 'homepage'})
        return meta is not None and meta.get('value') == 'true'

    def filter_code(self):
        """Rewrite <pre> blocks as Confluence code macros (CDATA body)."""
        for pre in self.soup.find_all('pre'):
            code_block = self.soup.new_tag('ac:structured-macro')
            code_block['ac:name'] = 'code'
            # Preserve the source language when the pre block declares it.
            if pre.has_attr('data-lang'):
                lang_param = self.soup.new_tag('ac:parameter')
                lang_param['ac:name'] = 'language'
                lang_param.append(pre['data-lang'])
                code_block.append(lang_param)
            plain_text = self.soup.new_tag('ac:plain-text-body')
            plain_text.append(self.soup.new_string(pre.get_text(), CData))
            code_block.append(plain_text)
            pre.replace_with(code_block)

    @property
    def cleaned_src(self):
        """Converted markup; runs all filters exactly once (in order)."""
        if not self.cleaned_up:
            self.cleaned_up = True
            # Order matters: links may wrap images, dl/code rewrites last.
            self.filter_image()
            self.filter_link()
            self.filter_dl()
            self.filter_code()
        body = self.soup.find('body')
        return (body and body.encode_contents(formatter='html')
                or self.soup.encode_contents(formatter='html'))
def tugua_download(url, directory="", date=None):
    '''\
    Download tugua of [date:datetime|str] from [url:str], and store into [directory:str].
    It will create a new folder named "YYYYmmdd" and store converted file into it, and store the original html file into "src" folder.
    Return: None
    '''
    # prepare source directory
    if (not date):
        date = datetime.date.today()
    if (isinstance(date, datetime.date)):
        date_str = date.strftime("%Y%m%d")
    else:
        date_str = date
    directory = os.path.realpath(os.path.abspath(directory))
    src_dir = os.path.join(directory, config["TUGUA"]["SrcDir"])
    if (not os.path.isdir(src_dir)):
        os.makedirs(src_dir)
    src_path = os.path.join(src_dir, date_str + ".html")
    # download contents (urlsrc is a module-level marker of the page
    # currently being processed; cleared again on success)
    global urlsrc
    url = url.strip()
    urlsrc = url
    down_url(url, src_path)
    data = None
    with open(src_path, "rb") as src_file:
        data = src_file.read()
    src = parse_html(data)
    dest = BeautifulSoup("", config["TUGUA"]["HtmlParser"])
    # analyze source title and frame
    title_tag_src = src.find("title")
    assert (title_tag_src), "No title found!"
    title = title_tag_src.get_text()
    # Title must look like 【喷嚏图卦YYYYMMDD】... and match the requested date.
    title_match = re.search(r"【喷嚏图卦(\d{8})】\S.*$", title)
    assert (title_match), "No title found!\n Title tag is '{}'.".format(title)
    assert (date_str == title_match.group(1)), "Date mismatch!\n Input is '{}', actual is '{}'.".format(date_str, title_match.group(1))
    title = title_match.group(0).strip()
    # Locate the content boundaries by their well-known boilerplate text.
    start_tag_src = src.find(text=re.compile(r"以下内容,有可能引起内心冲突或愤怒等不适症状。|本文转摘的各类事件,均来自于公开发表的国内媒体报道。引用的个人或媒体评论旨在传播各种声音,并不代表我们认同或反对其观点。"))
    end_tag_src = src.find(text=re.compile(r"广告联系:dapenti#dapenti.com"))
    if (end_tag_src):
        tmp = end_tag_src.find_next(text=re.compile(r"喷嚏网"))
        if (tmp):
            end_tag_src = tmp
        # Climb out of inline anchors to a real block-level end marker.
        while (not end_tag_src.name or end_tag_src.name == "a"):
            end_tag_src = end_tag_src.parent
    assert (start_tag_src) and (end_tag_src), "No content found!\n Start is '{}', end is '{}'.".format(start_tag_src, end_tag_src)
    # Guarantee the end marker has a successor so analysis can stop there.
    if (not end_tag_src.next_element):
        src.append(dest.new_tag("end"))
    # construct dest frame
    dest.append(dest.new_tag("html"))
    head_tag_dest = dest.new_tag("head")
    charset_tag = dest.new_tag("meta")
    charset_tag["http-equiv"] = "Content-Type"
    charset_tag["content"] = "text/html; charset={}".format(config["TUGUA"]["DestEncoding"])
    head_tag_dest.append(charset_tag)
    # Optional assets configured in the STYLE section.
    if (config["STYLE"]["JqueryFile"]):
        head_tag_dest.append(dest.new_tag("script", type="text/javascript", src=config["STYLE"]["JqueryFile"]))
    if (config["STYLE"]["CssFile"]):
        head_tag_dest.append(dest.new_tag("link", rel="stylesheet", type="text/css", href=config["STYLE"]["CssFile"]))
    if (config["STYLE"]["JsFile"]):
        head_tag_dest.append(dest.new_tag("script", type="text/javascript", src=config["STYLE"]["JsFile"]))
    title_tag_dest = dest.new_tag("title")
    title_tag_dest.string = title
    head_tag_dest.append(title_tag_dest)
    dest.html.append(head_tag_dest)
    body_tag_dest = dest.new_tag("body")
    dest.html.append(body_tag_dest)
    # analyze and convert
    subtitle_regex = re.compile(r"^【(\d{0,2})】(.*)")
    def stop_func(tag):
        # Analysis stops at the end marker or at the next 【NN】 subtitle.
        if (tag == end_tag_src):
            return True
        elif (not tag) or (not tag.string):
            return False
        elif (subtitle_regex.match(tag.string.strip())):
            return True
        else:
            return False
    (prologue, curr_src) = tugua_analyze(start_tag_src, dest, stop_func=stop_func)
    sections = []
    while True:
        assert (curr_src), "Unsupported Error!\n Analysis tag suspended."
        (section, curr_src) = tugua_analyze(curr_src, dest, stop_func=stop_func)
        sections.append(section)
        if (curr_src == end_tag_src):
            (last_tag, _) = tugua_analyze(curr_src, dest, search_sibling=False)
            if (last_tag.name == "div"):
                last_tag.name = "p"
            section.append(last_tag)  # a bit tricky, append it into previous section
            break
    # debug
    '''debug_output("0: {}".format(prologue))
    count = 0
    for section in sections:
        count += 1
        debug_output("{}: {}".format(count, section))'''
    # check section number
    number_error = 0
    number_count = 0
    number_delta = 0
    for section in sections:
        number_count = number_count + 1
        # First NavigableString in the section should be its 【NN】 subtitle.
        subtitle = section
        while (not isinstance(subtitle, NavigableString)):
            subtitle = subtitle.next_element
        assert (subtitle), "Content Error!\n Expect section '{}' but no text found in '{}'.".format(number_count, section)
        subtitle_match = subtitle_regex.match(subtitle)
        assert (subtitle_match), "Content Error!\n Expect subtitle '【{}】' but actual is '{}'.".format(number_count, subtitle)
        curr_id = subtitle_match.group(1)
        if (len(curr_id) > 0):
            curr_id = int(curr_id)
        else:
            curr_id = 0
        # Tolerate consistent off-by-N numbering via number_delta.
        if (curr_id != number_count and curr_id + number_delta != number_count):
            logger.warn("Subtitle number mismatch, expect '{}' but actual is '{}'.".format(number_count, subtitle_match.group(1)))
            number_error = number_error + 1
            number_delta = number_count - curr_id
        # Renumber the subtitle to the canonical zero-padded form.
        subtitle.replace_with(dest.new_string("【{:02}】{}".format(number_count, subtitle_match.group(2).strip())))
    assert (number_error <= config["CORRECTION"].getint("TitleNumErrorMax")), "Content Error!\n Too many subtitle number mismatch, totally {} errors.".format(number_error)
    # prepare destination directory
    dest_dir = os.path.join(directory, date_str)
    if (not os.path.isdir(dest_dir)):
        os.makedirs(dest_dir)
    os.chdir(dest_dir)
    # load img_info from tmp file (resume data for partially-downloaded
    # image sets, keyed by date)
    tmp_path = os.path.join(src_dir, config["TUGUA"]["TmpFile"])
    if (os.path.isfile(tmp_path)) and (os.path.getsize(tmp_path) > 0):
        with open(tmp_path, "rb") as tmp_file:
            tmp_data = pickle.loads(tmp_file.read())
    else:
        tmp_data = {}
    if (date_str not in tmp_data):
        tmp_data[date_str] = {}
    img_info = tmp_data[date_str]
    img_info["count"] = 0
    # format sections & download images
    try:
        prologue = tugua_format(prologue, dest, img_info=img_info)
        for index in range(len(sections)):
            img_info["count"] = 0
            sections[index] = tugua_format(sections[index], dest, img_info=img_info, section_id="{:02}".format(index+1), has_subtitle=True)
    finally:
        # store img_info into tmp file (even on failure, to allow resume)
        with open(tmp_path, "wb") as tmp_file:
            tmp_data[date_str] = img_info
            tmp_file.write(pickle.dumps(tmp_data))
    # separate extra, ad and epilogue from the last section
    tag = sections[-1]
    temp = []
    epi_regex = re.compile(r"^(友情提示:请各位河蟹评论。道理你懂的)|(\s*喷嚏新浪围脖:\s*@\s*喷嚏官微\s*、\s*@\s*喷嚏意图\s*(新浪)\s*)$")
    epi = None
    for child in tag.children:
        ch = child.contents[0]
        if (isinstance(ch, Tag)) and ((ch.name == "img") or (ch.name == "embed")):
            # Media resets the candidate "extra" run.
            temp.clear()
        elif (epi_regex.match(child.get_text())):
            epi = child
            break
        else:
            temp.append(child)
    assert (epi), "Content Error!\n No epilogue found in '{}'.".format(tag)
    extra_tag = dest.new_tag("div")
    ad_tag = dest.new_tag("div")
    if (len(temp) > 0):
        # The trailing one or two paragraphs (bare-link pattern) are the ad.
        ad_tmp = temp[-1].extract()
        if (len(ad_tmp.contents) == 1) and (ad_tmp.contents[0].name == "a") and (ad_tmp.contents[0].string.startswith("http")) and (len(temp) > 1):
            ad_tag.append(temp[-2].extract())
            ad_tag.append(ad_tmp)
            temp = temp[:-2]
        else:
            ad_tag.append(ad_tmp)
            temp = temp[:-1]
    for t in temp:
        extra_tag.append(t.extract())
    # Everything from the epilogue marker onward is the epilogue.
    epilogue_tag = dest.new_tag("div")
    while(epi):
        next_epi = epi.next_sibling
        epilogue_tag.append(epi.extract())
        epi = next_epi
    prologue["id"] = config["IDENT"]["Prologue"]
    prologue["class"] = config["IDENT"]["Prologue"]
    extra_tag["id"] = config["IDENT"]["Extra"]
    extra_tag["class"] = config["IDENT"]["Extra"]
    ad_tag["id"] = config["IDENT"]["Ad"]
    ad_tag["class"] = config["IDENT"]["Ad"]
    epilogue_tag["id"] = config["IDENT"]["Epilogue"]
    epilogue_tag["class"] = config["IDENT"]["Epilogue"]
    # generate title
    title_tag = dest.new_tag("div")
    title_tag["id"] = config["IDENT"]["Title"]
    title_tag["class"] = config["IDENT"]["Title"]
    title_tag.append(dest.new_tag("p"))
    title_tag.p.append(dest.new_tag("a"))
    title_tag.p.a["href"] = url
    title_tag.p.a.string = title
    # regroup: title, prologue, numbered sections, extra, ad, epilogue
    body_tag_dest.append(title_tag)
    body_tag_dest.append(prologue)
    for section in sections:
        body_tag_dest.append(section)
    body_tag_dest.append(extra_tag)
    body_tag_dest.append(ad_tag)
    body_tag_dest.append(epilogue_tag)
    #dest_path = os.path.join(dest_dir, "{}.html".format(title))
    dest_path = os.path.join(dest_dir, config["TUGUA"]["DestFile"])
    with open(dest_path, "wb") as dest_file:
        logger.info("Saving file '{}' ...".format(dest_path))
        dest_file.write(dest.prettify().encode(config["TUGUA"]["DestEncoding"]))
    # delete tmp record when complete
    del tmp_data[date_str]
    with open(tmp_path, "wb") as tmp_file:
        tmp_file.write(pickle.dumps(tmp_data))
    urlsrc = None
    return
#! /usr/bin/python #encoding=utf-8 from bs4 import BeautifulSoup '''演示如何添加一段字符串''' html_doc = '<b></b>' soup = BeautifulSoup(html_doc) tag = soup.b tag.append("hello") new_string = soup.new_string(" python") tag.append(new_string) print tag print tag.contents
# print cjcs.prettify() for p in cjcs.find_all('p'): if p.find('span', class_='yxs'): num = int(p.find('span', class_='yxs').string.split('.')[0]) p.find('span', class_='yxs').decompose() print num for match in p.find_all('a'): match.replaceWithChildren() for child in p.children: if child.string: s = child.string li = [] wrapper = soup.new_tag('div', **{'class':'temp'}) for i, cont in enumerate(s.split(' ')): if i % 2 == 0: li.append(soup.new_string(cont)) else: t = soup.new_tag('div', **{'class':'name'}) t.string = cont li.append(t) for t in li: wrapper.append(t) child.replace_with(wrapper) else: for i, cont in enumerate(child.children): if i % 2 != 0: t = soup.new_tag('div', **{'class':'name'}) t.string = cont cont.replace_with(t) for match in p.find_all('div', class_='temp'): match.replaceWithChildren()
def publish(fname, full=True):
    """Render the cached public Facebook status posts into a static HTML wall.

    When *full* is true, writes a complete page (wall-full.html); otherwise
    writes just the posts fragment (wall-posts.html).
    NOTE(review): *fname* is unused in the body -- output names are fixed.
    """
    cxn = sqlite3.connect(os.path.join(config_dir, 'fbk_cache.db'))
    cur = cxn.cursor()
    local_tz = get_localzone()
    # Only public status posts, newest first.
    sql_fetch_query = """SELECT `fbk_id`,`message`,`created_timestamp`,`privacy_description`
FROM `posts`
WHERE `privacy_description`='Public' AND `type`='status'
ORDER BY `created_timestamp` DESC"""
    cur.execute(sql_fetch_query)
    # Outer page skeleton.
    soup = BeautifulSoup(
        """<html><head><title>%s — Wall</title>
<meta charset="utf-8">
<link rel="stylesheet" href="style.css" type="text/css">
</head>
<body><table id="main"><thead /><tfoot /><tbody /></table></body></html>""" % obj_config['name'],
        "html.parser")
    body = soup.find('body')
    h1 = soup.new_tag('h1')
    h1.string = obj_config['name']
    # Optional tagline rendered inside the heading.
    if obj_config['tagline']:
        span = soup.new_tag('span')
        span['id'] = "tagline"
        span.string = obj_config['tagline']
        h1.append(span)
    body.append(h1)
    # Posts are built in a separate fragment so they can be written alone.
    main_body = BeautifulSoup('<div id="content" />', "html.parser")
    main = main_body.find(id='content')
    for post in cur.fetchall():
        p = transform(post)
        # hAtom-style entry markup per post.
        soup_post = BeautifulSoup("""<div class="feedentry hentry" id="fb_%s">
<span class="author vcard"><span class="fn profile">%s</span></span>
<span class="entry-title entry-content">%s</span>
<div class="timerow">
<time class="time published" title="%s" data-date="%s">
%s
</time>
</div>
</div>""" % (p['fbk_id'], obj_config['name'], p['message'],
             p['created_timestamp'], p['date'], p['sanitized_timestamp']),
            "html.parser")
        main.append(soup_post)
    now = datetime.now(local_tz)
    # Generation stamp embedded as an HTML comment node.
    fbk_util_comment = soup.new_string(
        "Generated by fbk_utils %s" % now.isoformat(), Comment)
    main.append(fbk_util_comment)
    if full:
        body.append(main_body)
        write_outfile(soup.prettify(), '.', 'wall-full.html')
    else:
        write_outfile(main_body.prettify(), '.', 'wall-posts.html')
    cxn.close()
    return
def xkcdify(content):
    """
    Replace text within a string as specified by the xkcd Substitutions
    comics.

    This takes an HTML fragment and replaces the text accordingly, wrapping
    the resulting substitutions in span tags.

    :param content: Original content with text to be replaced.
    :returns: Resulting content after xkcd substitutions.
    """
    def sub(matchobj):
        match = matchobj.group()
        key = match.lower().replace("-", " ")
        key1 = re.escape(key)
        # BUGFIX: the original used key.rstrip("'s"), which strips the
        # *character set* {', s} -- e.g. "boss" -> "bo".  Strip the
        # possessive/plural suffix explicitly instead, mirroring the
        # re-append logic below (trailing "s" or "'").
        base = key
        if base.endswith("'s"):
            base = base[:-2]
        elif base.endswith("s") or base.endswith("'"):
            base = base[:-1]
        key2 = re.escape(base)
        # First, check if the match has a substitution.
        # If it doesn't, check as if the match were plural or possessive.
        if key1 in subs:
            result = subs[key1]
        elif key2 in subs:
            result = subs[key2]
            # If the pattern encountered a match that's the plural or
            # possessive form of a key, modify the return value accordingly.
            if match.endswith("s"):
                result = result + "s"
            elif match.endswith("'"):
                result = result + "'"
        else:
            # No substitution known; signal "leave the original text".
            return ""
        return result

    # Get all the plain text strings in the document without their tags.
    soup = BeautifulSoup(content, 'html.parser')
    content_strings = [element for element in soup.recursiveChildGenerator()
                       if type(element) == NavigableString]
    for string in content_strings:
        # Use index to track where the current substring of plain text starts.
        index = 0
        # Use wrapper to string together plain text and span elements.
        wrapper_tag = soup.new_tag('span')
        # Upon each match, write to the wrapper the substitution result and
        # the plain text preceding it.  Then update index to the position
        # after the matched substring to mark the start of the next plain
        # text substring.
        for match in pattern.finditer(string):
            wrapper_tag.append(soup.new_string(string[index:match.start()]))
            replacement = soup.new_tag('span', **{
                'class': 'substitution',
                'data-tooltip': match.group()
            })
            replacement.string = sub(match)
            if replacement.string:
                wrapper_tag.append(replacement)
            else:
                wrapper_tag.append(soup.new_string(match.group()))
            index = match.end()
        # Keep the original plain text unless substitutions were made.
        if wrapper_tag.contents:
            # Only append the rest of the string if substitutions were made,
            # because we would otherwise be left with the full original
            # string.
            wrapper_tag.append(string[index:])
            string.replace_with(wrapper_tag)
            wrapper_tag.unwrap()
    return unicode(soup)
def sanitise_html(text, is_html):
    """Sanitise user content into safe HTML.

    Plain text (*is_html* false) is converted paragraph-by-paragraph, with
    http:// URLs turned into (truncated-label) links.  HTML input is
    filtered against a tag whitelist/blacklist with per-tag allowed
    attributes.  Returns the sanitised markup as a string.
    """
    if not is_html:
        # Plain text - generate HTML
        soup = BeautifulSoup()
        paras = text.split('\n')
        for para in paras:
            # Skip empty paragraphs
            if re.search(r'\S', para) is None:
                continue
            tag = soup.new_tag("p")
            # Attempt to make links and add text to the tag
            while True:
                mo = re.search(r'http://\S+', para)
                if mo is None:
                    # no links found - add remaining text to tag and finish
                    tag.append(soup.new_string(para))
                    break
                # Add text before link (if any) as string
                if mo.start() > 0:
                    tag.append(soup.new_string(para[:mo.start()]))
                # Strip final punctuation off link target, if applicable
                if re.match(r'.*[.,;/()]$', mo.group(0)) is not None:
                    link_href = para[mo.start():mo.end() - 1]
                    para = para[mo.end() - 1:]
                else:
                    link_href = mo.group(0)
                    para = para[mo.end():]
                link_tag = soup.new_tag("a", href=link_href)
                # Truncate long link labels to 22 chars + ellipsis.
                if len(link_href) <= 25:
                    link_tag.append(link_href)
                else:
                    link_tag.append(link_href[:22] + '...')
                tag.append(link_tag)
            soup.append(tag)
    else:
        # HTML - store sanitized HTML in the database
        blacklist = ['script', 'style']
        whitelist = {
            'a': ['href'],
            'p': None,
            'div': None,
            'span': None,
            'br': None,
            'table': None,
            'tr': None,
            'td': None,
            'th': None,
            'thead': None,
            'tbody': None,
            'ul': None,
            'ol': None,
            'li': None,
            'b': None,
            'strong': None,
            'i': None,
            'em': None,
            'u': None,
            'strike': None,
        }
        soup = BeautifulSoup(text)
        for tag in soup.findAll():
            if tag.name.lower() in blacklist:
                # remove including all children
                tag.extract()
            elif tag.name.lower() not in whitelist:
                # remove, retaining children
                tag.unwrap()
            else:
                # remove disallowed attributes
                permitted_attrs = whitelist[tag.name.lower()]
                # BUGFIX: deleting from tag.attrs while iterating it raises
                # RuntimeError (dict changed size during iteration);
                # iterate over a snapshot of the keys instead.
                for attr in list(tag.attrs):
                    if permitted_attrs is None or attr not in permitted_attrs:
                        del tag.attrs[attr]
    return soup.decode(formatter='html')