def process_html(content: bytes, req_context: RequestContext) -> str:
    soup = BeautifulSoup(content, 'html.parser')
    if not soup.head:
        soup.insert(0, soup.new_tag('head'))

    normalize_url_module = soup.new_tag('script')
    normalize_url_module.string = normalizeUrlScript
    soup.head.insert(0, normalize_url_module)

    jsRequestHandler = soup.new_tag('script')
    jsRequestHandler.string = jsRequestHandlerScript
    soup.head.insert(0, jsRequestHandler)

    for searchObj in linkAttr:
        for tag in soup.findAll(**searchObj):
            for attr in searchObj:
                tag[attr] = spoof_url(tag[attr], req_context)
            if tag.name == 'img':
                if tag.get('srcset'):
                    del tag['srcset']

    for style in soup.findAll('style'):
        if style.string is None:
            style.string = ''
        style.string = process_css(style.string, req_context)

    return str(soup)
def render(self):
    document = [self._get_global_style(), self._get_header()]
    keywords = defaultdict(lambda: defaultdict(str))
    for k, group in self.template_groups.items():
        for item in group:
            keywords.update(item.for_template())
    for k, group in self.template_groups.items():
        conclusion = []
        for item in group:
            document.append(item.template.body.format(**keywords))
            conclusion.append(item.template.conclusion)
        conclusion = ' '.join(conclusion)
        if BeautifulSoup(conclusion, 'html.parser').text:
            conclusion = BeautifulSoup(conclusion, 'html.parser')
            for p in conclusion.find_all('p'):
                p.name = 'span'
            conclusion.insert(
                0, BeautifulSoup(options.CONCLUSION, 'html.parser'))
            document.append(str(conclusion))
    document.append(self._get_footer())
    return ''.join(document)
def get_svg(xml, sketchID, version):
    root = ET.fromstring(xml)
    result_soup = BeautifulSoup()
    for kobject in root.findall('.//KObject'):
        objectID = kobject.attrib['id']
        parent = kobject.find('parent')
        parentID = parent.attrib['id']
        stroke = kobject.find('strokeData')
        if stroke is not None:
            path = ksketchsvg.get_polyline(stroke)
            color = ksketchsvg.convert_color(stroke.attrib['color'])
            thickness = stroke.attrib['thickness']
            tag = ksketchsvg.createTag(objectID, path, color, thickness,
                                       kobject.attrib['centroid'])
            if parentID == "0":
                result_soup.insert(len(result_soup.find_all('g', recursive=False)), tag)
            else:
                grp = result_soup.find('g', {'id': parentID})
                if grp:
                    grp.insert(len(grp.find_all('g', recursive=False)), tag)
        else:
            tag = ksketchsvg.createGroup(objectID)
            if parentID == "0":
                result_soup.insert(len(result_soup.find_all('g', recursive=False)), tag)
            else:
                grp = result_soup.find('g', {'id': parentID})
                if grp:
                    grp.insert(len(grp.find_all('g', recursive=False)), tag)
    soup = BeautifulSoup()
    g_tag = Tag(soup, name='g')
    g_tag['id'] = "0"
    g_tag.insert(0, result_soup)
    SVGCache.addSVGData(sketchID, version, g_tag.prettify())
    return g_tag.prettify()
def strip_html(path, i, label_xid=True):
    """Strip the HTML: get rid of scripts and interactions"""
    print '[{}] Reading {} ...'.format(i, path)
    # Assumes `open` is codecs.open (e.g. `from codecs import open`),
    # since the built-in open() takes no encoding argument in Python 2.
    with open(path, 'r', 'utf8') as fin:  # TODO: Handle other encodings
        soup = BeautifulSoup(fin.read(), 'html5lib')
    # Add doctype if missing
    if not has_doctype(soup):
        soup.insert(0, Doctype('html'))
    # Remove dangerous tags
    for x in soup('script'):
        x.extract()
    for x in soup('noscript'):
        x.extract()
    for x in soup('link'):
        if x.get('as') == 'script':
            x.extract()
    for x in soup('iframe'):
        x['src'] = ''
    # Fix styles
    for x in soup('style'):
        x.string = H.unescape(u"".join(unicode(y) for y in x.contents))
    # Label all tags
    i = 1
    for x in soup.body(True):
        for attr in list(x.attrs):
            if attr.startswith('on') or attr == 'srcset':
                del x[attr]
        if label_xid:
            x['data-xid'] = i
            i += 1
    # Return
    return soup.prettify()
def build_tags_pages(articles):
    all_tags = {}
    for article in articles:
        for tag in article.tags:
            all_tags.setdefault(tag.lower(), ([], []))
            all_tags[tag.lower()][0].append(tag)
            all_tags[tag.lower()][1].append(article)
    for tag, (representations, articles) in all_tags.items():
        if len(set(representations)) > 1:
            print("WARNING: There are multiple representations for tag {}: {}".format(
                tag, ", ".join(representations)))
        build_tag_page(representations[0], articles)
    lower_tags = list(all_tags.keys())
    lower_tags.sort(key=lambda x: (len(all_tags[x][1]), x))
    tags = BeautifulSoup('', 'html.parser')
    for i, tag in enumerate(lower_tags):
        rep = all_tags[tag][0][0]
        count = len(all_tags[tag][1])
        soup = BeautifulSoup(
            f'<li><a href="/tag/{tag}.html">{rep} ({count})</a></li>',
            'html.parser')
        tags.insert(i, soup)
    write(TAGS_PAGE.format(tags=tags.prettify()), "Tags | Layog's blog",
          f'{SRC_DIR}/tags.html')
def response(self, flow: http.HTTPFlow):
    response = flow.response
    if CONTENT_TYPE in response.headers:
        if any(map(lambda t: t in response.headers[CONTENT_TYPE],
                   RELEVANT_CONTENT_TYPES)):
            # Response is a web page; proceed.
            insertedScripts: List[str] = []
            soup = BeautifulSoup(response.content, HTML_PARSER,
                                 from_encoding=inferEncoding(response))
            requestURL = flow.request.pretty_url  # should work in transparent mode too, unless the Host header is spoofed
            isApplicable: Callable[[Userscript], bool] = userscript.applicableChecker(requestURL)
            for script in self.userscripts:
                if isApplicable(script):
                    useInline = ctx.options.inline or script.downloadURL is None
                    if useInline and len(script.unsafeSequences) > 0:
                        logError(unsafeSequencesMessage(script))
                        continue
                    logInfo(
                        f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ..."""
                    )
                    result = inject(script, soup, Options(
                        inline=ctx.options.inline,
                        verbose=ctx.options.verbose,
                    ))
                    if type(result) is BeautifulSoup:
                        soup = result
                        insertedScripts.append(script.name + (
                            "" if script.version is None
                            else " " + stringifyVersion(script.version)))
                    else:
                        logError("Injection failed due to the following error:")
                        logError(str(result))
            index_DTD: Optional[int] = indexOfDTD(soup)
            # Insert information comment:
            if ctx.options.verbose:
                soup.insert(
                    0 if index_DTD is None else 1 + index_DTD,
                    Comment(INFO_COMMENT_PREFIX + (
                        "No matching userscripts for this URL."
                        if insertedScripts == []
                        else "These scripts were inserted:\n" + bulletList(insertedScripts)
                    ) + "\n"))
            # Prevent BS/html.parser from emitting `<!DOCTYPE doctype html>` or
            # similar if "DOCTYPE" is not all uppercase in source HTML:
            if index_DTD is not None and REGEX_DOCTYPE.match(soup.contents[index_DTD]):
                # There is a DTD and it is invalid, so replace it.
                soup.contents[index_DTD] = Doctype(
                    re.sub(REGEX_DOCTYPE, "", soup.contents[index_DTD]))
            # Serialize and encode:
            response.content = str(soup).encode(
                fromOptional(soup.original_encoding, CHARSET_DEFAULT), "replace")
def prep_html(self, base_url, target):
    '''
    Replace variables in the email HTML with proper values and
    insert the tracking image URL if needed.
    '''
    # TODO: remove placeholder IP
    base_url = 'http://10.1.2.180:8080/'
    html = self.html
    html = html.replace(b'{{ fname }}', str.encode(target.first_name))
    html = html.replace(b'{{ lname }}', str.encode(target.last_name))
    html = html.replace(
        b'{{ name }}',
        str.encode('%s %s' % (target.first_name, target.last_name)))
    html = html.replace(b'{{ url }}', str.encode('%s' % target.result.tracker))
    html = html.replace(b'{{ id }}', str.encode(target.result.tracker))
    soup = BeautifulSoup(html, features='lxml')
    base = soup.new_tag('base', href=base_url)
    #soup.find('head').insert_before(base)
    soup.insert(1, base)
    if self.track:
        tracker = soup.new_tag('img', alt='',
                               src='%s/pixel.png' % (target.result.tracker))
        soup.find('body').insert_after(tracker)
    html = str(soup).encode()
    return html
def load_task_lists():
    # initiate database connection
    db_pars_path = '/home/scube_backend/.keys/mongodb_pars.yaml'  # note: hardcoded!
    db_pars = yaml.load(open(db_pars_path))
    db, db_client = utils.connect_mongoDB_server(db_pars)
    # query scflex control database
    ans = db_client['Scflex_control']['task_list_monitoring'].find(
        {"role": "task_list_info"})
    ans = list(ans)
    if len(ans) == 0:
        return ""  # nothing to list
    # get task list names
    task_list_names = [doc['db_name'] + '/' + doc['coll_name'] for doc in ans]
    # make the html
    soup = BeautifulSoup("", 'lxml')
    for name in task_list_names:
        new_tag = soup.new_tag('option')
        new_tag.attrs['value'] = name
        label = ' - '.join(name.split('/'))
        new_tag.insert(0, label)
        soup.insert(0, new_tag)
    # end for
    return str(soup)
def convert_gif_to_webm(link, file, ext, post_id):
    """Convert GIFs to webm."""
    clip = VideoFileClip(file)
    w, h = clip.size
    webm = BeautifulSoup("", "html5lib").new_tag("video")
    webm['autoplay'] = ""
    webm['loop'] = ""
    webm['controls'] = ""
    webm['style'] = "max-width: " + str(w) + "px;"
    source = BeautifulSoup("", "html5lib").new_tag("source")
    if ext == "webm":
        source['src'] = "/media" + link.group()
    else:
        file_out = uri_to_iri(
            "/root/myblog/myblog/blog/static/media/{}/{}/{}/{}-{}.webm".format(
                link.group("year"), link.group("month"), link.group("day"),
                link.group("file"), str(post_id)))
        link_out = uri_to_iri('/media/{}/{}/{}/{}-{}.webm'.format(
            link.group("year"), link.group("month"), link.group("day"),
            link.group("file"), str(post_id)))
        clip = VideoFileClip(file)
        video = CompositeVideoClip([clip])
        video.write_videofile(file_out, codec='libvpx', audio=False,
                              preset='superslow')
        source['src'] = link_out
    source['type'] = "video/webm"
    webm.insert(0, source)
    return webm
def set_doctype(soup: bs4.BeautifulSoup, version: str) -> None:
    if version not in DOCTYPES:
        raise ValueError('unsupported version: %s' % version)
    new_doctype = bs4.Doctype.for_name_and_ids(*DOCTYPES[version])
    for item in soup.contents:
        if isinstance(item, bs4.Doctype):
            item.replaceWith('')
    soup.insert(0, new_doctype)
def response(self, flow: http.HTTPFlow):
    response = flow.response
    if CONTENT_TYPE in response.headers:
        if any(map(lambda t: t in response.headers[CONTENT_TYPE],
                   RELEVANT_CONTENT_TYPES)):
            # Response is a web page; proceed.
            insertedScripts: List[str] = []
            soup = BeautifulSoup(response.content, HTML_PARSER,
                                 from_encoding=inferEncoding(response))
            requestURL = flow.request.pretty_url  # should work in transparent mode too, unless the Host header is spoofed
            if requestContainsQueryParam(option(T.option_query_param_to_disable),
                                         flow.request):
                logInfo(
                    f"""Not injecting any userscripts into {requestURL} because it contains a `{option(T.option_query_param_to_disable)}` query parameter."""
                )
                return
            isApplicable: Callable[[Userscript], bool] = userscript.applicableChecker(requestURL)
            for script in self.userscripts:
                if isApplicable(script):
                    useInline = option(T.option_inline) or script.downloadURL is None
                    if useInline and len(script.unsafeSequences) > 0:
                        logError(unsafeSequencesMessage(script))
                        continue
                    logInfo(
                        f"""Injecting {script.name}{"" if script.version is None else " " + VERSION_PREFIX + script.version} into {requestURL} ({"inline" if useInline else "linked"}) ..."""
                    )
                    result = inject(script, soup, Options(inline=option(T.option_inline)))
                    if type(result) is BeautifulSoup:
                        soup = result
                        insertedScripts.append(script.name + (
                            "" if script.version is None
                            else " " + T.stringifyVersion(script.version)))
                    else:
                        logError("Injection failed due to the following error:")
                        logError(str(result))
            index_DTD: Optional[int] = indexOfDTD(soup)
            # Insert information comment:
            if option(T.option_list_injected):
                soup.insert(
                    0 if index_DTD is None else 1 + index_DTD,
                    Comment(HTML_INFO_COMMENT_PREFIX + (
                        "No matching userscripts for this URL."
                        if insertedScripts == []
                        else "These scripts were inserted:\n" + bulletList(insertedScripts)
                    ) + "\n"))
            # Serialize and encode:
            response.content = str(soup).encode(
                fromOptional(soup.original_encoding, CHARSET_DEFAULT), "replace")
def download_all_images(self):
    # Feed each src URL returned by get_image_url_list() to the download method.
    for url in self.get_image_url_list():
        self.download(url)
    # Open and parse the saved HTML file (passing the bare path string to
    # BeautifulSoup would parse the path itself, not the file contents).
    with open('data/{}/{}.html'.format(self.webtoon_id, self.no)) as f:
        soup = BeautifulSoup(f, 'lxml')
    tag1 = Tag(name="html")
    soup.insert(0, tag1)
def clean_text(self):
    text = self.cleaned_data["text"]
    soup = BeautifulSoup(text, "html.parser")
    if not isinstance(soup.contents[0], Doctype):
        doctype = Doctype("html")
        soup.insert(0, doctype)
    return str(soup)
def addTagReferences(self, dirResult, fname, tagTypeCorpus, typeCorpus,
                     refsAfterSVM=[]):  # get "listRef" to check deleted notes
    """
    Add ignored tags from the initial file.

    Check the SVM classification result of each reference to decide whether it
    gets a <nonbibl> tag in the final construction, then call
    File::buildReferences for the modification and punctuation management and
    print the result.

    Parameters
    ----------
    dirResult : string
        directory for output files
    fname : string
        output filename
    tagTypeCorpus :
    typeCorpus : int, {1, 2, 3}
        type of corpus: 1 : corpus 1, 2 : corpus 2...
    refsAfterSVM : list
    """
    tmp_str = ""
    references = []
    fileRes = dirResult + fname
    for line in open(fileRes, 'r', encoding='utf8'):
        tmp_str = tmp_str + ' ' + line
    soup = BeautifulSoup(tmp_str)
    s = soup.findAll("bibl")
    cpt = 0  # total reference count
    for fichier in self.fichiers:  # original data
        nbRefFile = fichier.nbReference(typeCorpus)
        references[:] = []
        cptRef = 0  # reference count in the file
        for ref in s:
            if cptRef < nbRefFile:
                if len(refsAfterSVM) > 0 and refsAfterSVM[cpt].train == -1:
                    # the note (now tagged as <bibl>) is classified non-bibl
                    for tag in (s[cpt]).findAll(True):
                        tag.replaceWith(tag.renderContents())
                    s2 = BeautifulSoup()  # prepare tag set <bibl><nonbibl></nonbibl></bibl>
                    tag1 = s2.new_tag("bibl")
                    tag2 = s2.new_tag("nonbibl")
                    s2.insert(0, tag1)
                    tag1.insert(0, tag2)
                    # put the unwrapped contents in the middle of the tag set above
                    tag2.insert(0, s[cpt].renderContents())
                    references.append(s2.find("bibl"))  # make s2 have the found bibl
                else:
                    references.append(s[cpt])
            else:
                break
            cptRef += 1
            cpt += 1
        # Build references in the original files and save them at the root of dirResult
        dirResultRoot = os.path.abspath(os.path.join(dirResult, os.path.pardir)) + '/'
        fichier.buildReferences(references, tagTypeCorpus, dirResultRoot)  # new result printing
    return
def append_sender_to_message(message_plain: str, message_html: str,
                             sender: str) -> Tuple[str, str]:
    message_plain = f"{sender}: {message_plain}"
    message_soup = BeautifulSoup(message_html, "html.parser")
    sender_name_soup = BeautifulSoup(f"<b>{sender}</b>: ", "html.parser")
    first_tag = message_soup.find()
    if first_tag.name == "p":
        first_tag.insert(0, sender_name_soup)
    else:
        message_soup.insert(0, sender_name_soup)
    return message_plain, str(message_soup)
def _inline_script(script_tag: PageElement, script_file: Path) -> bool:
    """Replacement callable to replace scripts for inline_data."""
    script_content = NavigableString(script_file.read_text())
    new_script_tag = BeautifulSoup(features="html.parser").new_tag("script")
    new_script_tag.insert(0, script_content)
    new_script_tag["type"] = "text/javascript"
    script_tag.replaceWith(new_script_tag)
def _inline_css(style_tag: PageElement, style_file: Path) -> bool:
    """Replacement callable to replace stylesheets for inline_data."""
    style_content = NavigableString(style_file.read_text())
    new_style_tag = BeautifulSoup(features="html.parser").new_tag("style")
    new_style_tag.insert(0, style_content)
    new_style_tag["type"] = "text/css"
    style_tag.replaceWith(new_style_tag)
def append_sender_to_message(message_plain: str, message_html: str,
                             sender: str) -> Tuple[str, str]:
    message_plain = "{}: {}".format(sender, message_plain)
    message_soup = BeautifulSoup(message_html, "html.parser")
    sender_name_soup = BeautifulSoup("<b>{}</b>: ".format(sender), "html.parser")
    first_tag = message_soup.find()
    if first_tag.name == "p":
        first_tag.insert(0, sender_name_soup)
    else:
        message_soup.insert(0, sender_name_soup)
    return message_plain, str(message_soup)
def add_mathjax(ast: BeautifulSoup) -> BeautifulSoup:
    src_1 = "https://polyfill.io/v3/polyfill.min.js?features=es6"
    tag_1 = ast.new_tag('script', src=src_1)
    src_2 = "https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"
    # <script id="MathJax-script" async src=></script>
    tag_2 = ast.new_tag('script', src=src_2, id='MathJax-script')
    tag_2.attrs['async'] = None
    ast.insert(0, tag_2)
    ast.insert(0, tag_1)
    return ast
def get_html_listing_soup(
    in_folder: Union[Path, str],
    page_title: Optional[str] = None,
    out_file: Optional[Union[Path, str]] = None,
) -> BeautifulSoup:
    in_folder = Path(in_folder)
    soup = BeautifulSoup("", "html5lib")
    cast(Tag, soup.find("html"))["lang"] = "en"
    soup.insert(0, Doctype("html"))

    if page_title is None:
        page_title = in_folder.stem
    head = cast(Tag, soup.find("head"))
    title = soup.new_tag("title")
    title.string = page_title
    head.append(title)

    body = cast(Tag, soup.find("body"))
    ul: Tag = soup.new_tag("ul")
    body.append(ul)

    now_sec = int(time.time())
    inlined_suffix_regex = re.compile(r"_inlined$")
    li: Tag
    for demo_full_path in sorted(in_folder.glob("**/*.html")):
        if demo_full_path.is_dir() or demo_full_path.name == "index.html":
            continue
        li = soup.new_tag("li")
        ul.append(li)
        demo_relative_path = urllib.parse.quote(
            str(demo_full_path.relative_to(in_folder)), safe="/")
        a = soup.new_tag(
            "a",
            href=(f"./{demo_relative_path}?t={now_sec}"),
        )
        demo_name = inlined_suffix_regex.sub("", demo_full_path.stem)
        a.string = demo_name
        li.append(a)

    if out_file is None:
        out_file = in_folder / "index.html"
    _ = Path(out_file).write_text(str(soup))
    return soup
def txt_link_downloader(html_link):
    soup = BeautifulSoup(html_link, 'html.parser')
    list_df = []
    batch = soup.find_all('td')
    counter = 0
    for index, i in enumerate(xrange(0, len(batch), 6)):
        list_df.append(map(lambda x: x.get_text(), batch[i:i + 6]))
        url_end = BeautifulSoup(batch[i + 2].encode('utf-8'),
                                'html.parser').find('a').get('href')
        url = 'http://www.the-numbers.com' + url_end
        list_df[index].append(url)
        response = urllib2.urlopen(url)
        main_doc = response.read()
        soup = BeautifulSoup(main_doc, 'html.parser')
        mpaaRating = []
        for tr in soup.findAll('tr'):
            for td in tr.findAll('td'):
                mpaaRating.append(td.get_text())
        mpaaRating = [unidecode.unidecode(x).strip() for x in mpaaRating]
        list_of_variables = [
            'Genre:', 'Running Time:', 'MPAA Rating:', 'Production Companies:',
            'Domestic Releases:', 'Domestic DVD Sales', 'Domestic Blu-ray Sales',
            'Total Domestic Video Sales', 'Rotten Tomatoes'
        ]
        second_page = solver(list_of_variables, mpaaRating)
        list_df[index].extend(second_page)
        response = urllib2.urlopen(url)
        main_doc = response.read()
        soup = BeautifulSoup(main_doc, 'html.parser')
        soup = soup.find(text=re.compile(
            'Weekend Box Office Performance')).parent.parent.find(
                'div', attrs={"id": "box_office_chart"})
        try:
            soup = soup.get_text()
            soup = unicodedata.normalize('NFKD', soup).encode('utf-8').split()[4:35]
            soup.insert(3, 'None')
            list_df[index].extend(soup)
        except:
            pass
        counter += 1
        # sets upper limit, max is 5230 as of 10/9/2016
        if counter == 2000:
            return DataFrame(list_df)
def prep_html(self, base_url, target, result, url):
    '''
    Replace variables in the email HTML with proper values and
    insert the tracking image URL if needed.
    '''
    # get result for this target in this campaign
    #result = next((x for x in target.results if x.campaign_id == campaign_id), None)
    #result = Result.query.filter_by(campaign_id=int(campaign_id), person_id=target.id).first()

    # get if campaign is using SSL
    ssl = result.campaign.ssl
    # get port the worker will host on
    port = result.campaign.port
    # get the domain name the campaign is using
    domain = result.campaign.domain.domain
    payload_url_path = result.campaign.payload_url

    # determine if base URLs are using HTTP/HTTPS and include port number
    # in URLs for non-standard ports
    if ssl:
        if port != 443:
            base_url = f'https://{domain}:{port}'
            payload_url = f'https://{domain}:{port}{payload_url_path}?id={result.tracker}'
        else:
            base_url = f'https://{domain}'
            payload_url = f'https://{domain}{payload_url_path}?id={result.tracker}'
    else:
        if port != 80:
            base_url = f'http://{domain}:{port}'
            payload_url = f'http://{domain}:{port}{payload_url_path}?id={result.tracker}'
        else:
            base_url = f'http://{domain}'
            payload_url = f'http://{domain}{payload_url_path}?id={result.tracker}'

    if url[0] != '/':
        url = '/' + url

    html = self.html
    if target.first_name:
        html = html.replace(b'{{ fname }}', str.encode(target.first_name))
    if target.last_name:
        html = html.replace(b'{{ lname }}', str.encode(target.last_name))
    if target.first_name and target.last_name:
        html = html.replace(
            b'{{ name }}',
            str.encode('%s %s' % (target.first_name, target.last_name)))
    html = html.replace(b'{{ email }}', str.encode(target.email))
    html = html.replace(
        b'{{ url }}',
        str.encode('%s%s?id=%s' % (base_url, url, result.tracker)))
    html = html.replace(b'{{ id }}', str.encode(result.tracker))
    html = html.replace(b'{{ payload_url }}', str.encode(payload_url))

    soup = BeautifulSoup(html, features='lxml')
    base = soup.new_tag('base', href=base_url)
    soup.insert(1, base)
    if self.track:
        tracker = soup.new_tag('img', alt='',
                               src=f'{base_url}/default/{result.tracker}/logo.png')
        soup.find('body').insert_after(tracker)
    html = str(soup).encode()
    return html
def adder(fpath):
    with open(fpath) as fp:
        soup = BeautifulSoup(fp, "html.parser")
    new_child = r"{% load static %}"
    soup.insert(0, new_child)

    img = soup.find_all("img")
    link = soup.find_all("link")  # for CSS use only
    script = soup.find_all("script")
    regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

    try:
        for i in link:
            url = re.findall(regex, i.attrs["href"])
            if url == []:
                temp = i.attrs["href"]
                temp2 = r"{% static '" + temp + r"' %}"
                i.attrs["href"] = temp2
    except:
        pass

    try:
        for i in img:
            url = re.findall(regex, i.attrs["src"])
            if url == []:
                temp = i.attrs["src"]
                temp2 = r"{% static '" + temp + r"' %}"
                i.attrs["src"] = temp2
    except:
        pass

    try:
        for i in script:
            url = re.findall(regex, i.attrs["src"])
            if url == []:
                temp = i.attrs["src"]
                temp2 = r"{% static '" + temp + r"' %}"
                i.attrs["src"] = temp2
                # <script data-cfasync="false" src="../../cdn-cgi/scripts/5c5dd728/cloudflare-static/email-decode.min.js"></script>
    except:
        pass

    # with open(fpath, "w") as file:
    #     print(str(soup))
    #     file.write(str(soup))
    import io
    with io.open(fpath, "w", encoding="utf-8") as f:
        f.write(str(soup))
    return r'Added {% static '' %} in ' + fpath + ' ...Done'
def clean_summary(instance):
    if type(instance) == Article:
        summary = instance.summary
        summary = BeautifulSoup(instance.summary, 'html.parser')
        images = summary.findAll('img')
        if len(images) > maximum_images:
            for image in images[maximum_images:]:
                image.extract()
        if len(images) < 1 and minimum_one:
            # try to find one
            content = BeautifulSoup(instance.content, 'html.parser')
            first_image = content.find('img')
            if first_image:
                summary.insert(0, first_image)
        instance._summary = text_type(summary)
def clean_text(self):
    text = self.cleaned_data["text"]
    soup = BeautifulSoup(text, "html.parser")
    if not isinstance(soup.contents[0], Doctype):
        doctype = Doctype("html")
        soup.insert(0, doctype)
    imgid = 0
    for img in soup.findAll("img"):
        img["id"] = "img%s" % imgid
        imgid += 1
    return str(soup)
def split_chars(data):
    soup = BeautifulSoup(data, 'html.parser')
    paths = []
    svgs = []
    for i in range(6):
        a = soup.find('path').extract()
        if a.get('fill') != 'none':
            paths.append(a)
    for path in paths:
        outer_tag = BeautifulSoup(str(soup), 'html.parser').find('svg').extract()
        outer_tag.insert(1, path)
        svgs.append(str(outer_tag))
    return svgs
def build_card(article):
    tags = BeautifulSoup('', 'html.parser')
    for i, tag in enumerate(article.tags):
        soup = BeautifulSoup(f'<li><a href="#">{tag}</a></li>', 'html.parser')
        tags.insert(i, soup)
    card = ARTICLE_CARD.format(
        title=article.title,
        tags=tags.prettify(),
        date=article.date.strftime('%Y-%m-%d'),
        readable_date=article.date.strftime('%B %d, %Y'),
        summary=article.summary.prettify(),
        article_link=article.path)
    return BeautifulSoup(card, 'html.parser')
def get_content_to_file(url, htmlfilename):
    with requests.Session() as s:
        s.headers = {'Content-Type': 'application/json', 'x-api-key': _apiKey}
        response = s.get('https://mercury.postlight.com/parser',
                         params={'url': url})
        if response.status_code != 200:
            return 'Error, no data from %s' % url
        data = response.json()
    content = data['content']
    title = data['title']
    date_publish_string = data['date_published']
    excerpt = data['excerpt']
    url = data['url']
    html = BeautifulSoup(content, 'lxml')
    # html = BeautifulSoup( content, 'html5lib' )
    #
    ## now all png objects to inline
    if not os.path.exists(title):
        os.mkdir(title)
    for img in html.find_all('img'):
        imgURL = img['src']
        split = urlparse.urlsplit(imgURL)
        filename = "./%s/" % (title) + split.path.split("/")[-1]
        urllib.urlretrieve(imgURL, filename)
        img['src'] = filename
        # if imgURL.lower( ).endswith('.png'):
        #     img_64 = "data:image/png;base64," + base64.b64encode( urllib.urlopen( imgURL ).read( ) )
        # elif imgURL.lower( ).endswith( '.jpg' ):
        #     img_64 = "data:image/jpg;base64," + base64.b64encode( urllib.urlopen( imgURL ).read( ) )
        # else:
        #     img_64 = None
        #
        # if img_64 is not None:
        #     img['src'] = img_64
    if not os.listdir(title):
        os.rmdir(title)
    htag = html.new_tag('head')
    mtag = html.new_tag('meta')
    mtag['charset'] = 'utf-8'
    htag.append(mtag)
    html.insert(0, htag)
    if not htmlfilename:
        htmlfilename = title + '.html'
    with codecs.open(htmlfilename, 'w', 'utf-8') as openfile:
        openfile.write('%s\n' % html.prettify())
def get_content_to_file(url, htmlfilename):
    with requests.Session() as s:
        s.headers = {'Content-Type': 'application/json', 'x-api-key': _apiKey}
        response = s.get('https://mercury.postlight.com/parser',
                         params={'url': url})
        if response.status_code != 200:
            return 'Error, no data from %s' % url
        data = response.json()
    content = data['content']
    title = data['title']
    date_publish_string = data['date_published']
    excerpt = data['excerpt']
    url = data['url']
    # html = BeautifulSoup( content, 'lxml' )
    html = BeautifulSoup(content, 'html5lib')
    #
    ## now all png objects to inline
    if not os.path.exists(title):
        os.mkdir(title)
    for img in html.find_all('img'):
        imgURL = img['src']
        split = urlparse.urlsplit(imgURL)
        filename = "./%s/" % (title) + split.path.split("/")[-1]
        urllib.urlretrieve(imgURL, filename)
        img['src'] = filename
        # if imgURL.lower( ).endswith('.png'):
        #     img_64 = "data:image/png;base64," + base64.b64encode( urllib.urlopen( imgURL ).read( ) )
        # elif imgURL.lower( ).endswith( '.jpg' ):
        #     img_64 = "data:image/jpg;base64," + base64.b64encode( urllib.urlopen( imgURL ).read( ) )
        # else:
        #     img_64 = None
        #
        # if img_64 is not None:
        #     img['src'] = img_64
    if not os.listdir(title):
        os.rmdir(title)
    htag = html.new_tag('head')
    mtag = html.new_tag('meta')
    mtag['charset'] = 'utf-8'
    htag.append(mtag)
    html.insert(0, htag)
    if not htmlfilename:
        htmlfilename = title + '.html'
    with codecs.open(htmlfilename, 'w', 'utf-8') as openfile:
        openfile.write('%s\n' % html.prettify())
def create_xml():
    """Create XML. Return string (soup.prettify())."""
    soup = BeautifulSoup()
    tags = [{
        "tag": "root",
        "count": 1,
        "parent": None
    }, {
        "tag": "var",
        "count": 2,
        "parent": "root",
        "attr": {
            "name": ["id", "level"],
            "value": ["rand_str", "rand_int"]
        }
    }, {
        "tag": "objects",
        "count": 1,
        "parent": "root"
    }, {
        "tag": "object",
        "count": int(random.uniform(1, 11)),
        "parent": "objects",
        "attr": {
            "name": "rand_str"
        }
    }]
    for tag_dict in tags:
        parent = soup.find(tag_dict["parent"])
        for tag_number in range(tag_dict["count"]):
            new_tag = soup.new_tag(tag_dict["tag"])
            attr_dict = tag_dict.get("attr")
            if attr_dict is not None:
                for attr_key in attr_dict:
                    if isinstance(attr_dict[attr_key], list):
                        param = attr_dict[attr_key][tag_number]
                        new_tag[attr_key] = return_value(param)
                    else:
                        new_tag[attr_key] = return_value(attr_dict[attr_key])
            if parent is None:
                soup.insert(0, new_tag)
            else:
                parent.insert(len(parent), new_tag)
    return soup.prettify()
def insert_base_markup(soup: BeautifulSoup) -> BeautifulSoup:
    head = soup.new_tag("head")
    soup.insert(0, head)

    simplecss_link: Tag = soup.new_tag("link")
    # <link rel="stylesheet" href="https://cdn.simplecss.org/simple.css">
    simplecss_link["rel"] = "stylesheet"
    simplecss_link["href"] = "https://cdn.simplecss.org/simple.css"
    head.append(simplecss_link)

    # Basic style tags for compat
    style: Tag = soup.new_tag("style")
    style.append(_STYLE_TAG_CONTENT)
    head.append(style)

    return soup
def save(self, commit=True):
    m = super(MailWithAttachmentForm, self).save(commit=False)
    soup = BeautifulSoup(m.text, "html.parser")
    if not isinstance(soup.contents[0], Doctype):
        doctype = Doctype("html")
        soup.insert(0, doctype)
    m.text = str(soup)
    m.template_type = "2"
    if commit:
        m.save()
    return m
def processSaveReport():
    print('Processing and saving report')
    cwd = os.getcwd()
    folder = 'templates'
    filename = 'svreport.html'
    svreportLocation = cwd + os.sep + folder + os.sep + filename
    try:
        soup = BeautifulSoup(open(svreportLocation), features="html.parser")
        print(f'Successfully parsed report at location: {svreportLocation}')
    except:
        print(f"Couldn't parse report at location: {svreportLocation}")

    navUpdateString = """
    //Sets nav-link to active for this page.
    $(document).ready(function () {
        $('#datagraphs').addClass('active');
    });
    """
    insertTopString = "{% extends 'base.html' %}\n{% block content %}"
    insertBottomString = "{% endblock %}"

    # Remove doctype:
    for item in soup.contents:
        if isinstance(item, Doctype):
            item.extract()
    # end remove doctype

    # Remove <html></html> tags
    for match in soup.findAll('html'):
        match.unwrap()

    # Remove <link> tags
    soup.find('link').extract()

    soup.insert(0, insertTopString)
    soup.insert(len(soup) + 1, insertBottomString)

    # script insert
    scriptTag = soup.new_tag('script')
    scriptTag.append(navUpdateString)
    #soup.body.insert(len(soup.body.contents), navUpdateString)
    head = soup.find('head')
    head.insert(1, scriptTag)

    #print(soup.prettify())
    writeFile(svreportLocation, soup)
def _create_new_soup_with_div_container(self, doctype_element, head_element,
                                        title_string):
    container_div = "<div class='container'></div>"
    body_soup = BeautifulSoup(str(container_div), "html5lib")
    body_soup = self._insert_google_adsense(body_soup)

    # Insert document type
    body_soup.insert(0, doctype_element)

    # Insert head element
    title = body_soup.new_tag('title')
    title.string = title_string
    head_clone = self._create_clone(head_element)
    head_clone.append(title)
    new_first_page_head = body_soup.head
    new_first_page_head.replace_with(head_clone)
    return body_soup
def clean_summary(instance):
    if type(instance) == Article:
        summary = BeautifulSoup(instance.summary, 'html.parser')
        if clean_target_tag:
            # remove specific type of tags with class
            remove_targets = summary.findAll(clean_target_tag,
                                             attrs={"class": clean_target_class})
            for target in remove_targets:
                # print(target)
                target.extract()
        images = summary.findAll('img')
        if len(images) > maximum_images:
            for image in images[maximum_images:]:
                image.extract()
        if len(images) < 1 and minimum_one:
            # try to find one
            content = BeautifulSoup(instance.content, 'html.parser')
            first_image = content.find('img')
            if first_image:
                summary.insert(0, first_image)
        instance._summary = text_type(summary)
def prettyprint_email_as_html(email_json):
    soup = BeautifulSoup()
    html = soup.new_tag('html')
    head = soup.new_tag('head')
    title = soup.new_tag('title')
    html.append(head)
    head.append(title)
    title.append(email_json["id"])

    # build a single <body>, attach it to <html>, and put <html> into the soup
    body = soup.new_tag('body')
    html.append(body)
    soup.insert(0, html)

    body.append("ID: {0}".format(email_json["id"]))
    body.append(soup.new_tag('br'))
    body.append("From: {0}".format(email_json["senders_line"][0]))
    body.append(soup.new_tag('br'))
    body.append("To: {0}".format(email_json["tos_line"][0]))
    body.append(soup.new_tag('br'))
    if email_json["ccs_line"]:
        body.append("Cc: {0}".format(email_json["ccs_line"][0]))
        body.append(soup.new_tag('br'))
    body.append("Sent: {0}".format(email_json["datetime"]))
    body.append(soup.new_tag('br'))
    body.append("Subject: {0}".format(email_json["subject"]))
    body.append(soup.new_tag('br'))
    pre = soup.new_tag('pre')
    pre.append(email_json["body"])
    body.append(pre)
    return soup.prettify()
def build_dumb_bonita_error_body(exception='', code='', message=''):
    # Add your own Bonita java Exception in this dict to make your call shorter,
    # so you can call with exception='UserNotFoundException'
    # rather than exception='org.ow2.bonita.facade.exception.UserNotFoundException'
    java_exception_dict = {
        'UserNotFoundException': 'org.ow2.bonita.facade.exception.UserNotFoundException',
        'ProcessNotFoundException': 'org.ow2.bonita.facade.exception.ProcessNotFoundException',
        'GroupNotFoundException': 'org.ow2.bonita.facade.exception.GroupNotFoundException',
        'RoleNotFoundException': 'org.ow2.bonita.facade.exception.RoleNotFoundException'}
    exception_text = java_exception_dict.get(exception, exception)

    # Build XML body
    soup = BeautifulSoup('', 'xml')
    tag_exception = soup.new_tag(exception_text)
    tag_code = soup.new_tag('errorCode')
    tag_message = soup.new_tag('detailMessage')
    tag_code.string = code
    tag_message.string = message
    soup.insert(0, tag_exception)
    tag_exception.insert(0, tag_code)
    tag_exception.insert(1, tag_message)
    return unicode(soup)
soup = BeautifulSoup(features='html5lib')

# create tags
tag1 = soup.new_tag("person")
tag2 = soup.new_tag("name")
tag3 = soup.new_tag("location")

# add attributes
tag2['first'] = 'John'
tag2['last'] = 'Smith'
tag3['country'] = 'uk'

# add text
text = NavigableString("John Gary Smith")

# build soup
soup.insert(0, tag1)
tag1.insert(0, tag2)
tag1.insert(1, tag3)
tag2.insert(0, text)

print(soup)
print("----------------")
print(soup.prettify())
def update_html_for_static(book, html_content, epub=False):
    soup = BeautifulSoup(html_content, 'lxml-html')

    # remove encoding as we're saving to UTF8 anyway
    encoding_specified = False
    for meta in soup.findAll('meta'):
        if 'charset' in meta.attrs:
            encoding_specified = True
            # logger.debug("found <meta> tag with charset `{}`"
            #              .format(meta.attrs.get('charset')))
            del(meta.attrs['charset'])
        elif 'content' in meta.attrs \
                and 'charset=' in meta.attrs.get('content'):
            try:
                ctype, ccharset = meta.attrs.get('content').split(';', 1)
            except:
                continue
            else:
                encoding_specified = True
            # logger.debug("found <meta> tag with content;charset `{}`"
            #              .format(meta.attrs.get('content')))
            meta.attrs['content'] = ctype
    if encoding_specified:
        # logger.debug("charset was found and removed")
        pass

    # update all <img> links from images/xxx.xxx to {id}_xxx.xxx
    if not epub:
        for img in soup.findAll('img'):
            if 'src' in img.attrs:
                img.attrs['src'] = img.attrs['src'].replace(
                    'images/', '{id}_'.format(id=book.id))

    # update all <a> links to internal HTML pages
    # should only apply to relative URLs to HTML files.
    # examples on #16816, #22889, #30021
    def replacablement_link(book, url):
        try:
            urlp, anchor = url.rsplit('#', 1)
        except ValueError:
            urlp = url
            anchor = None
        if '/' in urlp:
            return None
        if len(urlp.strip()):
            nurl = "{id}_{url}".format(id=book.id, url=urlp)
        else:
            nurl = ""
        if anchor is not None:
            return "#".join([nurl, anchor])
        return nurl

    if not epub:
        for link in soup.findAll('a'):
            new_link = replacablement_link(
                book=book, url=link.attrs.get('href', ''))
            if new_link is not None:
                link.attrs['href'] = new_link

    # Add the title
    if not epub:
        soup.title.string = book.title

    patterns = [
        ("*** START OF THE PROJECT GUTENBERG EBOOK",
         "*** END OF THE PROJECT GUTENBERG EBOOK"),
        ("***START OF THE PROJECT GUTENBERG EBOOK",
         "***END OF THE PROJECT GUTENBERG EBOOK"),
        ("<><><><><><><><><><><><><><><><><><><><><><><><><><><><>"
         "<><><><><><>",
         "<><><><><><><><><><><><><><><><><><><><><><><><><><><><>"
         "<><><><><><>"),
        # ePub only
        ("*** START OF THIS PROJECT GUTENBERG EBOOK",
         "*** START: FULL LICENSE ***"),
        ("*END THE SMALL PRINT! FOR PUBLIC DOMAIN ETEXT",
         "——————————————————————————-"),
        ("*** START OF THIS PROJECT GUTENBERG EBOOK",
         "*** END OF THIS PROJECT GUTENBERG EBOOK"),
        ("***START OF THE PROJECT GUTENBERG",
         "***END OF THE PROJECT GUTENBERG EBOOK"),
        ("COPYRIGHT PROTECTED ETEXTS*END*",
         "==========================================================="),
        ("Nous remercions la Bibliothèque Nationale de France qui a mis à",
         "The Project Gutenberg Etext of"),
        ("Nous remercions la Bibliothèque Nationale de France qui a mis à",
         "End of The Project Gutenberg EBook"),
        ("=========================================================="
         "===============",
         "——————————————————————————-"),
        ("Project Gutenberg Etext", "End of Project Gutenberg Etext"),
        ("Text encoding is iso-8859-1", "Fin de Project Gutenberg Etext"),
        ("—————————————————-",
         "Encode an ISO 8859/1 "
         "Etext into LaTeX or HTML"),
    ]

    body = soup.find('body')
    try:
        is_encapsulated_in_div = sum(
            [1 for e in body.children
             if not isinstance(e, bs4.NavigableString)]) == 1
    except:
        is_encapsulated_in_div = False

    if is_encapsulated_in_div and not epub:
        DEBUG_COUNT.append((book.id, book.title))

    if not is_encapsulated_in_div:
        for start_of_text, end_of_text in patterns:
            if start_of_text not in body.text and end_of_text not in body.text:
                continue

            if start_of_text in body.text and end_of_text in body.text:
                remove = True
                for child in body.children:
                    if isinstance(child, bs4.NavigableString):
                        continue
                    if end_of_text in getattr(child, 'text', ''):
                        remove = True
                    if start_of_text in getattr(child, 'text', ''):
                        child.decompose()
                        remove = False
                    if remove:
                        child.decompose()
                break

            elif start_of_text in body.text:
                # logger.debug("FOUND START: {}".format(start_of_text))
                remove = True
                for child in body.children:
                    if isinstance(child, bs4.NavigableString):
                        continue
                    if start_of_text in getattr(child, 'text', ''):
                        child.decompose()
                        remove = False
                    if remove:
                        child.decompose()
                break

            elif end_of_text in body.text:
                # logger.debug("FOUND END: {}".format(end_of_text))
                remove = False
                for child in body.children:
                    if isinstance(child, bs4.NavigableString):
                        continue
                    if end_of_text in getattr(child, 'text', ''):
                        remove = True
                    if remove:
                        child.decompose()
                break

    # build infobox
    if not epub:
        infobox = jinja_env.get_template('book_infobox.html')
        infobox_html = infobox.render({'book': book})
        info_soup = BeautifulSoup(infobox_html, 'lxml-html')
        body.insert(0, info_soup.find('div'))

    # if there is no charset, set it to utf8
    if not epub:
        meta = BeautifulSoup('<meta http-equiv="Content-Type" '
                             'content="text/html; charset=UTF-8" />',
                             'lxml-html')
        head = soup.find('head')
        html = soup.find('html')
        if head:
            head.insert(0, meta.head.contents[0])
        elif html:
            html.insert(0, meta.head)
        else:
            soup.insert(0, meta.head)

        return html

    return soup
browser.open(base + "/news")
link = browser.get_link(text="UK")
browser.open(base + link['href'])
soup = browser.parsed

# pick out anchors that are tagged with the story class
# tags = soup.findAll("a", "story")
tags = soup.findAll("a")

newSoup = BeautifulSoup(features="html5lib")

for tag in tags:
    # add base url if it is missing from href
    if tag['href'][0] == "/":
        tag['href'] = base + tag['href']
    # add tag to new soup followed by a <br>
    newSoup.insert(0, tag)
    br = soup.new_tag("br")
    newSoup.insert(0, br)

# convert soup into a string
data = str(newSoup)

# save scraped html to a file
try:
    f = open("out.html", "w", encoding="UTF-8")
    f.write(data)
    f.close()
except IOError as e:
    print(e)

# display local file in browser
<img src="./output/wc.png" alt="" height="250" width="250" /> <div id=hashtag"> <h5>Hashtag Count</h5> </div> <img src="./output/hashtag.png" alt="" height="250" width="250" /> </div> <table> </table> </body> </html> ''' '''Inserts the URL sentiment into the HTML tag <table>''' soup = BeautifulSoup(doc,'html.parser') body = soup.new_tag('body') soup.insert(0, body) table = soup.new_tag('table') body.insert(0, table) with open("./output/url_sentiment.txt") as infile: for line in infile: row = soup.new_tag('tr') col1, col2, col3 = line.split() for coltext in (col3, col2, col1): # important that you reverse order col = soup.new_tag('td') col.string = coltext row.insert(0, col) table.insert(len(table.contents), row) with open('sentiment.html', 'w') as outfile: