def sanitize_html(content):
    """Return ``content`` after passing it through a default ``Sanitizer``.

    A fresh ``Sanitizer`` (no custom settings) is created on every call and
    the result of its ``sanitize`` method is returned as-is.
    """
    return Sanitizer().sanitize(content)
def parse_opportunity(self, response):
    """Scrape one opportunity page and pass the result to ``store_opportunity``.

    The stored dict carries the page title, the request URL, the two
    application counters and ``full_text``: the sanitized markup of every
    ``div.column-one-whole`` on the page, concatenated in document order.

    :param response: the response object for the opportunity page; it must
        offer ``xpath``, ``css`` and ``request.url``.
    """
    sanitizer = Sanitizer({
        'tags': ('hr', 'a', 'br', 'p', 'ul', 'ol', 'li', 'h1', 'h2', 'h3',
                 'table', 'tbody', 'tr', 'td'),
    })

    # Blurt it all out in full for now.  Every matched column is kept; the
    # previous loop re-assigned the payload each time round and therefore
    # stored only the last column.
    matches = response.xpath('.//div[@class="column-one-whole"]').extract()
    full_payload = "".join(sanitizer.sanitize(item) for item in matches)

    self.store_opportunity({
        'title': response.css('header h1::text').extract_first().strip(),
        'url': response.request.url,
        'incomplete_applications': response.css(
            '#incomplete-applications .big-statistic::text'
        ).extract_first().strip(),
        'complete_applications': response.css(
            '#completed-applications .big-statistic::text'
        ).extract_first().strip(),
        'full_text': full_payload,
    })
def edit_profile(request):
    """Show and process the "edit profile" page for the logged-in user.

    GET renders both forms bound to the current user and profile.  POST
    validates both forms; when both are valid the user is saved, the profile
    bio is sanitized and saved, and the client is redirected to
    ``accounts:profile``.  An invalid POST re-renders the page with the bound
    forms.
    """
    user = get_object_or_404(get_user_model(), pk=request.user.pk)
    up_instance = user.userprofile if hasattr(user, 'userprofile') else None

    if request.method != "POST":
        # GET
        user_form = P7UserChangeForm(instance=user,
                                     initial={'confirm_email': user.email})
        profile_form = UserProfileForm(instance=up_instance)
    else:
        user_form = P7UserChangeForm(request.POST,
                                     instance=user,
                                     initial={'confirm_email': user.email})
        profile_form = UserProfileForm(request.POST,
                                       request.FILES,
                                       instance=up_instance)
        # Validate both forms unconditionally so each one collects its errors.
        user_ok = user_form.is_valid()
        profile_ok = profile_form.is_valid()
        if user_ok and profile_ok:
            user_form.save()
            profile = profile_form.save(commit=False)
            profile.user = user
            profile.bio = Sanitizer().sanitize(profile.bio)
            profile.save()
            return redirect(reverse('accounts:profile'))

    return render(
        request,
        'accounts/edit_profile.html',
        {'user_form': user_form, 'profile_form': profile_form},
    )
def getList(data_dir="C:/Bitnami/wampstack-7.3.6-2/apache2/htdocs/data"):
    """Return sanitized ``<li>`` link markup for every entry in ``data_dir``.

    :param data_dir: directory whose entries are listed; defaults to the
        path that used to be hard-coded, so existing callers are unaffected.
    :return: the sanitized HTML string, or ``""`` when the directory is empty.
    """
    # Local import so the block does not depend on the file's import section.
    from html import escape

    links = [
        # File names are escaped before they are placed inside the quoted
        # href attribute so a quote or "<" in a name cannot break the markup.
        "<li><a href = 'index.py?id={name}'>{name}</a></li>".format(
            name=escape(file_name, quote=True))
        for file_name in os.listdir(data_dir)
    ]
    if not links:
        return ""
    # Sanitize the finished list once instead of re-sanitizing the growing
    # string after every single entry.
    return Sanitizer().sanitize("".join(links))
def getList():
    """Return ``<li>`` link markup for every entry of the ``data`` directory.

    Each entry name is run through ``Sanitizer.sanitize`` before it is placed
    in the link; the pieces are concatenated in ``os.listdir`` order.
    """
    cleaner = Sanitizer()
    link_template = '<li><a href="index.py?id={name}">{name}</a></li>'
    return ''.join(
        link_template.format(name=cleaner.sanitize(entry))
        for entry in os.listdir('data')
    )
def do_GET(self):
    """Serve an XML feed of Workable jobs for the companies named in the query.

    The query string is a comma-separated list of Workable account names.
    For each account the public widget API is queried and every job becomes a
    ``<job>`` element (title, Email, url, location, sanitized description,
    id) under a single ``<jobs>`` root.  On success the XML is sent with
    status 200; any exception produces a 500 HTML error page instead.
    """
    # Local import so the block does not depend on the file's import section.
    from html import escape

    try:
        request_url = urlparse(self.requestline)
        # requestline is the raw "GET /?a,b HTTP/1.1" line, so the last name
        # still carries " HTTP/1.1"; split()[0] keeps only the name itself.
        companies = [
            name.split()[0] for name in request_url.query.split(',')
        ]

        # HTML Sanitizer -- built once and reused for every job description.
        # The allowed attributes must be a tuple: a bare string would turn
        # the sanitizer's membership test into a substring test.
        sanitizer = Sanitizer({
            'tags': ('em', 'strong', 'a', 'p', 'br', 'span', 'ul', 'ol',
                     'li', 'h1', 'h2', 'h3', 'hr'),
            'attributes': {
                "a": ("href",)
            },
        })

        allJobsXml = ET.Element('jobs')
        for company in companies:
            r = requests.get(
                f"https://apply.workable.com/api/v1/widget/accounts/{company}?details=true"
            )
            data = r.json()
            for job in data["jobs"]:
                jobXml = ET.SubElement(allJobsXml, 'job')
                ET.SubElement(jobXml, 'title').text = job["title"]
                ET.SubElement(
                    jobXml, 'Email'
                ).text = f"team+{company}@climate.careers"
                ET.SubElement(jobXml, 'url').text = job["url"]
                ET.SubElement(jobXml, 'location').text = (
                    job["city"] + ", " + job["state"] + ", " + job["country"]
                )
                ET.SubElement(jobXml, 'description').text = \
                    sanitizer.sanitize(job["description"])
                ET.SubElement(jobXml, 'id').text = job["shortcode"]

        self.send_response(200)
        self.send_header('Content-type', 'text/xml')
        self.send_header(
            'Cache-Control',
            'public, immutable, no-transform, s-maxage=3600, max-age=3600')
        self.end_headers()
        self.wfile.write(ET.tostring(allJobsXml))
        return
    except Exception as e:
        self.send_response(500)
        self.send_header('Content-type', 'text/html')
        self.end_headers()
        message = "<h1>Internal Error</h1><p>Sorry, there was a problem. Make sure the employer's name is " \
                  "included in the URL query and/or is the correct name.</p> "
        # The exception text is escaped because it is embedded in HTML.
        detail = f"<p>{escape(str(e))}</p>"
        # Send the explanation as well as the detail; previously the
        # explanation was built but never written to the client.
        self.wfile.write((message + detail).encode())
        return
def post(self):
    """Update announcement.

    Reads the ``announcement`` body argument, sanitizes it, stores it on the
    queue under the current user's name and redirects to the ``view`` URL.
    """
    user = self.get_current_user()
    cleaner = Sanitizer()
    raw_announcement = self.get_body_argument("announcement")
    announcement = cleaner.sanitize(raw_announcement)
    self.queue.update(user["name"], announcement)
    target = self.application.reverse_url("view")
    self.redirect(target)
def get_list():
    """Return ``<li>`` link markup for each entry in the ``data`` directory.

    Every entry name is sanitized first and then used both as the ``id``
    query value and as the link text.
    """
    cleaner = Sanitizer()
    rows = [
        '<li><a href="index.py?id={name}">{name}</a></li>'.format(
            name=cleaner.sanitize(entry))
        for entry in os.listdir('data')
    ]
    return ''.join(rows)
def handle(event, context):
    """Handle one chat WebSocket event.

    If the JSON body contains ``getAllMessages``, the stored messages are
    posted back to the calling connection.  Otherwise ``body["data"]`` is
    sanitized, scored by the ``<Stage>-AnalysisHandler`` Lambda, stored in the
    messages table and broadcast to every known connection.

    :param event: the API Gateway event; ``body`` and ``requestContext``
        (``domainName``, ``connectionId``) are read from it.
    :param context: the Lambda context object (unused).
    :return: ``{"statusCode": 200}`` in both cases.
    """
    ddb = boto3.resource('dynamodb')
    messages_table = MessagesStore(ddb)
    # Parse the body once; both branches need it.
    body = json.loads(event["body"])
    request_context = event["requestContext"]

    def gateway_client():
        """Build the API Gateway management client for this event's endpoint."""
        endpoint = ("https://" + request_context["domainName"] + "/" +
                    os.environ["Stage"])
        return boto3.client("apigatewaymanagementapi", endpoint_url=endpoint)

    if "getAllMessages" in body:
        gateway_client().post_to_connection(
            ConnectionId=request_context["connectionId"],
            Data=json.dumps(messages_table.get_messages()).encode('utf-8'))
        return {"statusCode": 200}

    connections_ids = ConnectionsStore(ddb).get_connections()
    chat_message = Sanitizer().sanitize(body["data"])

    lambda_client = boto3.client('lambda')
    analysis_handler_function_name = "{}-{}".format(os.environ["Stage"],
                                                    "AnalysisHandler")
    response = lambda_client.invoke(
        FunctionName=analysis_handler_function_name,
        InvocationType='RequestResponse',
        Payload=json.dumps({"message": chat_message}))
    sentiment_response = response['Payload'].read().decode("utf-8")

    messageToSend = {
        "action": "sendMessage",
        "data": {
            "message": chat_message,
            "sentiment": sentiment_response
        }
    }
    messages_table.add_new_message(
        date=date.today().strftime("%d-%m-%Y"),
        timestamp=int(round(time.time() * 1000)),
        message=messageToSend["data"])

    # The client and the encoded payload are identical for every recipient,
    # so build them once instead of once per connection.
    gatewayapi = gateway_client()
    payload = json.dumps(messageToSend).encode('utf-8')
    for connection_id in connections_ids:
        gatewayapi.post_to_connection(
            ConnectionId=connection_id['connectionId'], Data=payload)
    return {"statusCode": 200}
def process_item(self, item, spider):
    """Normalise items scraped by the ``these_people`` spider.

    Items from any other spider are returned untouched.  For matching items
    the leading field label is dropped from ``languages``, ``education`` and
    ``hourly_daily_rate``; the rate is split into its non-empty parts;
    ``languages`` is split on ``|``; and ``skills``, ``other_info`` and
    ``availability`` are joined and sanitized.

    :param item: the scraped item (mapping of field name to list of strings).
    :param spider: the spider that produced ``item``; only ``name`` is read.
    :return: the same ``item`` object, modified in place when it matches.
    """
    if spider.name != 'these_people':
        return item

    sanitizer = Sanitizer({
        'empty': {
            'h1', 'h2', 'h3', 'strong', 'em', 'p', 'ul', 'ol', 'li', 'br',
            'sub', 'sup', 'hr', 'a'
        },
        'separate': set()
    })

    # Each of these lists may start with its own on-page label.  The
    # emptiness check keeps an empty scrape result from raising IndexError.
    leading_labels = (
        ('languages', 'Sprachkenntnisse:'),
        ('education', 'Abschluss:'),
        ('hourly_daily_rate', 'Stunden-/Tagessatz:'),
    )
    for field, label in leading_labels:
        values = item[field]
        if values and values[0] == label:
            item[field] = values[1:]

    h_d_rate = ' '.join(item['hourly_daily_rate'])
    h_d_rate = re.sub(' +|\n', '|', h_d_rate).split('|')
    stripped_parts = (part.strip() for part in h_d_rate)
    item['hourly_daily_rate'] = [part for part in stripped_parts if part]

    item['languages'] = ''.join(item['languages']).split('|')
    for field in ('skills', 'other_info', 'availability'):
        item[field] = sanitizer.sanitize(''.join(item[field]))
    return item
def clean_webpage(cls, item):
    """Sanitize one downloaded policy page and write the cleaned copy to disk.

    The file at ``item`` is parsed, stripped with ``cls.bs4_aggressive_remove``,
    sanitized with ``config.sanitizer_settings`` and written, wrapped in a
    minimal HTML skeleton, to ``config.processed_policies`` under the same
    base name.

    :param item: path of the HTML file to clean, or ``None``.
    :return: ``(item, sanitized_policy)`` where ``sanitized_policy`` is the
        absolute path of the written file, or ``(item, None)`` when ``item``
        is ``None``.
    """
    if item is None:
        # NOTE(review): this used to return three values while the success
        # path returns two; it now matches the success path so callers can
        # unpack the result uniformly.  Confirm no caller relied on three.
        return item, None

    with open(item, "r", encoding="utf-8") as input_f:
        html = input_f.read()

    soup = BeautifulSoup(html, "lxml")
    cls.bs4_aggressive_remove(soup)
    sanitized = Sanitizer(settings=config.sanitizer_settings).sanitize(
        str(soup))
    fresh_soup = BeautifulSoup(sanitized, "lxml")

    sanitized_policy = os.path.abspath(
        os.path.join(config.processed_policies, os.path.basename(item)))
    with open(sanitized_policy, "w", encoding="utf-8") as output_f:
        output_f.write("<html>\n"
                       "<head>\n"
                       "\t<meta charset=\"utf-8\"/>\n"
                       "\t<title></title>\n"
                       "</head>\n"
                       f"{cls.prettify(fresh_soup.body)}\n"
                       "</html>")
    return item, sanitized_policy
def _compose_html(self, tagged_text, hard_words):
    """Render the tagged text as sanitized HTML.

    Every ``chunk`` becomes a ``<p>``.  Inside a sentence each ``tok`` is
    emitted with a leading space unless an ``ns`` element came right before
    it; a token whose ``orth`` or ``base`` is in ``hard_words`` is wrapped in
    ``<mark>``.  Sentences with more than 20 ``tok`` children are wrapped in
    ``<span class="long_sentence">`` and counted in
    ``self.long_sentence_count``.

    :return: the sanitized HTML string.
    """
    sanitizer = Sanitizer({
        'tags': {'p', 'mark', 'span'},
        'attributes': {
            'span': ('class', )
        },
        'empty': set(),
        'separate': {'p', 'mark', 'span'},
    })

    rendered_paragraphs = []
    for chunk in tagged_text.iter('chunk'):
        rendered_sentences = []
        for sentence in chunk.iter('sentence'):
            pieces = []
            glue_next = False
            for node in sentence:
                if node.tag == 'ns':
                    glue_next = True
                    continue
                if node.tag != 'tok':
                    continue
                orth = node.xpath('orth/text()')[0]
                base = node.xpath('lex/base/text()')[0]
                if orth in hard_words or base in hard_words:
                    orth = '<mark>{}</mark>'.format(orth)
                pieces.append(orth if glue_next else ' ' + orth)
                glue_next = False
            text = ''.join(pieces)
            if sentence.xpath('count(./tok)') > 20:
                self.long_sentence_count += 1
                text = '<span class="long_sentence">{}</span>'.format(text)
            rendered_sentences.append(text)
        rendered_paragraphs.append(
            '<p>{}</p>'.format(' '.join(rendered_sentences)))
    return sanitizer.sanitize(''.join(rendered_paragraphs))
def get_content(url):
    """Fetch a job posting and return its sanitized ``<section>`` markup.

    :param url: address of the job page.
    :return: ``{"post_details": <sanitized html>, "fields": {}}``.
    :raise ServerException: if the page is not fetched within 5 seconds.
    :raise InvalidUserInput: if the page has no ``<ul class="details">`` or
        the content cannot be sanitized.
    """
    try:
        web_page = requests.get(url, allow_redirects=True, timeout=5)
    except requests.exceptions.Timeout as err:
        raise ServerException(
            description=
            "Failed to fetch web page in time! Skipping preprocessing"
        ) from err

    soup = BeautifulSoup(web_page.content, "html.parser")
    content = str(soup.section)

    # A job page is only accepted when it has the details list.
    # TODO: extract the individual fields (e.g. the "#icon-contract" entry)
    # from this list; "fields" is returned empty until that exists.
    if soup.find('ul', attrs={"class": "details"}) is None:
        raise InvalidUserInput(
            description="Couldn't find details for that job")

    try:
        content = Sanitizer().sanitize(content)
    except Exception as err:
        # Same exception type as before, but system-exiting exceptions are
        # no longer swallowed and the original cause stays attached.
        raise InvalidUserInput(
            description="Couldn't find details for that job") from err
    return {"post_details": content, "fields": {}}
def _decode_data_url(cls, url): """Convert a data: URL to a string of sanitized HTML. :raise ValueError: If the data: URL is invalid, in an unexpected format, or does not have a supported media type. :return: A string. """ if not url.startswith("data:"): raise ValueError("Not a data: URL: %s" % url) parts = url.split(",") if len(parts) != 2: raise ValueError("Invalid data: URL: %s" % url) header, encoded = parts if not header.endswith(";base64"): raise ValueError("data: URL not base64-encoded: %s" % url) media_type = header[len("data:"):-len(";base64")] if not any( media_type.startswith(x) for x in ("text/html", "text/plain")): raise ValueError("Unsupported media type in data: URL: %s" % media_type) html = base64.b64decode(encoded) return Sanitizer().sanitize(html)
CONF_DICT['gui_chat']['style_settings'] = LCStaticBox()
CONF_DICT['gui_chat']['style_settings']['show_system_msg'] = LCBool(True)
CONF_DICT['gui_chat']['style_settings']['show_history'] = LCBool(True)
CONF_DICT['server_chat'] = LCPanel()
CONF_DICT['server_chat']['style'] = LCChooseSingle(DEFAULT_STYLE,
                                                   check_type='dir',
                                                   folder='http',
                                                   empty_label=True)
CONF_DICT['server_chat']['style_settings'] = LCStaticBox()
CONF_DICT['server_chat']['style_settings']['show_system_msg'] = LCBool(True)
CONF_DICT['server_chat']['style_settings']['show_history'] = LCBool(True)

# Maps a message class to the type string used for it.
TYPE_DICT = {TextMessage: 'message', CommandMessage: 'command'}
SANITIZER = Sanitizer()


def process_emotes(emotes):
    """Return one ``{'id', 'url'}`` dict per emote, ids run through EMOTE_FORMAT."""
    return [
        dict(id=EMOTE_FORMAT.format(item.id), url=item.url)
        for item in emotes
    ]


def process_badges(badges):
    """Return one ``{'badge', 'url'}`` dict per badge."""
    return [dict(badge=item.id, url=item.url) for item in badges]


def process_platform(platform):
    """Return the platform's ``id`` and ``icon`` as a dict."""
    return dict(id=platform.id, icon=platform.icon)
def _read_scaled_sensor(path, to_text, max_retries=10):
    """Read a milli-unit reading from ``path`` and format it with ``to_text``.

    The read is retried while it raises ``OSError``; after more than
    ``max_retries`` failures the string ``"ERR"`` is returned instead.
    """
    with open(path, 'r') as sensor:
        failures = 0
        while True:
            try:
                return to_text(float(sensor.readline()) / 1000)
            except OSError:
                failures += 1
                if failures > max_retries:
                    return "ERR"


def update_temp():
    """Refresh all displayed readings and append one row to the CSV log.

    Reads the 1-wire cottage sensor, the DHT temperature and humidity, and
    the four pumphouse temperatures served at ``http://192.168.0.180/temp``.
    The values are pushed into the display variables and written through
    ``csvwriter``.

    :raise Exception: if the cottage sensor reports a bad CRC.
    """
    # Get local cottage temp
    with open('/sys/bus/w1/devices/28-000007171178/w1_slave', 'r') as f:
        crc_line = f.readline()
        if "YES" not in crc_line:
            raise Exception("Bad CRC for Cottage temp sensor")
        # Second line ends in "t=<millidegrees>".
        cot = f.readline().split("t=")[1].rstrip("\n")
    cot = temp_to_format_str(float(cot) / 1000)

    # Get local cottage temp and humidity from DHT
    cot2 = _read_scaled_sensor(
        '/sys/bus/iio/devices/iio:device0/in_temp_input',
        temp_to_format_str)
    hm = _read_scaled_sensor(
        '/sys/bus/iio/devices/iio:device0/in_humidityrelative_input',
        lambda value: str(round(value, 2)) + " %")

    tempCottage.set(cot)
    tempCottage2.set(cot2)
    humid.set(hm)

    # Default every pumphouse reading to "ERR" so a failed request or a
    # label missing from the response no longer leaves a variable unbound.
    lakeTempVar = outsideTempVar = intakeTempVar = ambientTempVar = "ERR"

    # Get info from pumphouse
    r = requests.get("http://192.168.0.180/temp")
    # Set encoding to UTF-8 since the HTML is in it but apparently the ESP
    # sends a different one
    r.encoding = "UTF-8"
    if r.ok:
        # Clean up and split response
        txt = Sanitizer().sanitize(r.text)
        # Split on line breaks and remove all but temp info
        for item in txt.split("<br>")[1:5]:
            tempS = item.split(":")
            tmpValue = float(removeNonDecimal(tempS[1]))
            if "Lake" == tempS[0]:
                lakeTempVar = temp_to_format_str(tmpValue)
            elif "Outside" == tempS[0]:
                outsideTempVar = temp_to_format_str(tmpValue)
            elif "Pump intake" == tempS[0]:
                intakeTempVar = temp_to_format_str(tmpValue)
            elif "Ambient outdoor" == tempS[0]:
                ambientTempVar = temp_to_format_str(tmpValue)

    tempLake.set(lakeTempVar)
    tempOutside.set(outsideTempVar)
    tempIntake.set(intakeTempVar)
    tempAmb.set(ambientTempVar)

    csvwriter.writerow({
        'Time': time.strftime("%F %T"),
        'Outside Temp': removeNonDecimal(outsideTempVar),
        'Cottage Temp': removeNonDecimal(cot),
        'Cottage Temp (DHT)': removeNonDecimal(cot2),
        'Lake Temp': removeNonDecimal(lakeTempVar),
        'Pump Intake': removeNonDecimal(intakeTempVar),
        'Ambient Lake Temp': removeNonDecimal(ambientTempVar),
        'Humidity': removeNonDecimal(hm),
    })
from airflow.models import Variable from airflow.hooks.postgres_hook import PostgresHook from html_sanitizer import Sanitizer import nltk from utilities import safeget from rock_media import is_media_video, is_media_audio import requests nltk.download('punkt') summary_sanitizer = Sanitizer({ 'tags': {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}, 'empty': {}, 'separate': {}, 'attributes': {}, }) html_allowed_tags = { 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'p', 'a', 'ul', 'ol',
from django.shortcuts import render, redirect import random from main.forms import AdModelForm, FiltersForm from main.models import User, Ad, Skill, PetProject, Responsibility, Pide from djmoney.money import Money from html_sanitizer import Sanitizer sanitizer = Sanitizer() sanitizer.tags = set(sanitizer.tags).union([ 'p', 'span', 'i', 'u', 'hr', 'ol', 'li', 'br', 'blockquote', 'ul', 'pre', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' ]) ################################################## # Helper functions ################################################## def negate_ad(ad_type): assert (ad_type == "vacancy" or ad_type == "resume") return 'vacancy' if ad_type == 'resume' else 'resume' def save_skills(post, ad): i = 1 last = post.get('skill1') while last is not None: Skill.objects.create(text=last, ad_id=ad).save()
from django.shortcuts import render, redirect, get_object_or_404 from django.http import Http404, HttpResponse from django.conf import settings from django.core.exceptions import PermissionDenied, ValidationError from django.contrib.auth.mixins import LoginRequiredMixin from django.contrib.admin.views.decorators import staff_member_required from django.views.generic.list import ListView from .models import Video, Audio, VideoPlaylist, AudioPlaylist, Text, Upload, SiteSetting import uuid from .panopto import panopto_oauth2 from html_sanitizer import Sanitizer html_sanitizer = Sanitizer() html = html_sanitizer.sanitize def shib_bounce(request): """This view is for bouncing the user to the desired location after they have authenticated with Shibboleth and been bounced back to /login. Assumes that a 'next' argument has been set in the URL. E.g. '/login?next=/inventory/'. PersistentRemoteUserMiddleware logs the user in automatically, so there is no need for this view to do this work manually. """ try: next = request.GET['next'] except KeyError: raise Http404("No bounce destination.") if request.user.is_authenticated: return redirect(next) else:
import sys
import traceback
from flask import Flask, request, render_template, send_from_directory, Response
import validators
from util.validate import validate_email
from util.validate import validate_phone
from util.db import db
from html_sanitizer import Sanitizer
import time
import util.env
from util.mail import send_email
from requests import get, post
from os import environ

sanitizer = Sanitizer()
app = Flask(__name__)
# The session-signing key should not live in source control.  It is read from
# the FLASK_SECRET_KEY environment variable when that is set; the previous
# literal stays as the fallback so existing deployments keep working.
# NOTE(review): rotate the fallback value, since it has been committed.
app.secret_key = environ.get('FLASK_SECRET_KEY',
                             'Zli6WMDUEboJnp34fzwK').encode('utf8')


@app.route('/assets/<path>')
def send_assets(path):
    """Serve the file ``path`` from the ``assets`` directory."""
    return send_from_directory('assets', path)


@app.route('/assets/icons/<path>')
def send_icons(path):
    """Serve the file ``path`` from the ``assets/icons`` directory."""
    return send_from_directory('assets/icons', path)
from html_sanitizer import Sanitizer # import the logging library import logging # Get an instance of a logger logger = logging.getLogger(__name__) # Limit results for ElasticSearch to this number by Default ES_DEFAULT_LIMIT = 50 sanitizer = Sanitizer({ 'tags': {'a',}, 'attributes': { 'a': ('href', 'name', 'target', 'title', 'id'), }, 'empty': set(), 'separate': {'a',}, }) class EmailController(): """ Email utility controller, sends emails based on templates """ # Get an instance of a logger logger = logging.getLogger("EmailController") sender = u'OffenesParlament <*****@*****.**>' fail_silently = False
# Sanitizer configuration for this module.
# NOTE(review): presumably html_sanitizer's Sanitizer -- confirm against the
# file's imports, which are not visible here.
sanitizer = Sanitizer({
    # Tag names listed for the sanitizer.
    'tags': (
        'a', 'b', 'blockquote', 'br', 'center', 'code', 'del', 'em', 'h1',
        'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'img', 'i', 'li', 'ol', 'pre',
        'p', 'strike', 'strong', 'sup', 'sub', 'ul',
    ),
    # Attribute names listed per tag; only "a" and "img" have entries.
    'attributes': {
        "a": ("href", "name", "title", "id", "rel"),
        "img": ("src", "width", "height", "alt"),
    },
    # Explicitly empty list of element preprocessors.
    'element_preprocessors': [],
    # sanitize_href is defined elsewhere in the file and passed in as-is.
    'sanitize_href': sanitize_href,
})
# CATEGORY_STRING = "Grace Hopper" # MANIFEST_URL = "http://example.com" CATEGORY_STRING = sys.argv[1] MANIFEST_URL = sys.argv[2] COMMONS_CAT_TEMPLATE = u"https://commons.wikimedia.org/w/api.php?action=query&\ generator=categorymembers&iiurlwidth={0}&gcmtitle=\ Category:{1}&gcmlimit=500&gcmtype=file&prop=imageinfo&\ iiprop=url|timestamp|user|mime|extmetadata&format=json" HEADERS = {'user-agent': 'Science Stories API ([email protected])'} sanitizer = Sanitizer({ 'tags': {'a', 'b', 'br', 'i', 'img', 'p', 'span'}, 'attributes': { 'a': ('href'), 'img': ('src', 'alt') }, 'empty': {'br'}, 'separate': {'a', 'p'} }) html_converter = HTML2Text() html_converter.ignore_links = True def main(): """Call Main Function.""" return iiif_cat_manifest(CATEGORY_STRING) def safe_str(obj): """Return unicode encoding."""
f.write(full_html) for a in articles: app_body = '' print(f"Process {a['url']}...") article = json.loads( subprocess.getoutput(f"../api/read-article {a['url']}")) article['url'] = a['url'] article['body'] = markdown.markdown(article['body'], extensions=md_extensions, output_format='html5') article['body'] = update_link(article['body'], a['url']) article['comments_body'] = '' for comment in article.get('comments', {}): sanitizer = Sanitizer() comment['body'] = markdown.markdown(comment['body'], extensions=md_extensions, output_format='html5') comment['body'] = sanitizer.sanitize(comment['body']) if comment['site']: if comment['site'].startswith( 'https://') or comment['site'].startswith('http://'): comment['site'] = sanitizer.sanitize(comment['site']) else: comment['site'] = None article['comments_body'] += comment_tpl(comment) app_body += article_tpl(article) full_html = main_tpl({ 'body': app_body,
#!C:\Users\msy94\AppData\Local\Programs\Python\Python37\python.exe
#!python3
#-*- coding: utf-8 -*-
import sys, codecs, os
import html
from urllib.parse import quote
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
import cgi, cgitb
import view
cgitb.enable()
print("Content-Type: text/html; charset=utf-8\r\n")
print()
form = cgi.FieldStorage()
from html_sanitizer import Sanitizer
sanitizer = Sanitizer()
if "id" in form:
    # "id" comes straight from the request.  basename() drops any directory
    # part so a value such as "../../secret" cannot read outside data/.
    title = pageId = os.path.basename(form["id"].value)
    with open(os.path.join("data", pageId), encoding="utf-8") as page_file:
        description = page_file.read()
    description = sanitizer.sanitize(description)
    title = sanitizer.sanitize(title)
    # The id is URL-quoted for the query string and HTML-escaped for the
    # attribute value so it cannot break out of the surrounding markup.
    update_link = "<a href = 'update.py?id={pageId}'>update</a>".format(
        pageId=quote(pageId))
    delete_action = '''
        <form action="process_delete.py" method="post">
            <input type = "hidden" name = "pageId" value="{}">
            <input type = "submit" value = "delete">
        </form>
    '''.format(html.escape(pageId, quote=True))
else:
    title = pageId = "Welcome"
    description = "Hello Web"
    update_link = ""
    delete_action = ""
# Escaped because this is written into the HTML response.
print(html.escape(pageId))
# Sanitizer configuration for this module.
# NOTE(review): presumably html_sanitizer's Sanitizer -- confirm against the
# file's imports, which are not visible here.
sanitizer = Sanitizer({
    # Tag names listed for the sanitizer.
    "tags": {
        "a", "b", "blockquote", "br", "div", "em", "h1", "h2", "h3", "hr",
        "i", "li", "ol", "p", "span", "strong", "sub", "sup", "ul", "img",
    },
    # Attribute names listed per tag; only "a" has an entry.
    # NOTE(review): "img" is in "tags" but has no entry here, while "src" is
    # listed under "a" -- check whether "src" was meant for "img".
    "attributes": {
        "a": ("href", "name", "target", "title", "id", "rel", "src", "style")
    },
    "empty": {"hr", "a", "br", "div"},
    "separate": {"a", "p", "li", "div"},
    "whitespace": {"br"},
    # The three switches below are all turned off.
    "keep_typographic_whitespace": False,
    "add_nofollow": False,
    "autolink": False,
})