def endElement(self, tag):
    """Handle a closing tag and try to add a missing ``i18n:translate``.

    Pops the ``(tag, attrs, data)`` triple recorded for this element from
    ``self._history``.  When the stripped text is considered translatable,
    is not replaced by TAL, and we are not inside an i18n scope
    (``self._i18nlevel == 0``), the source file is re-parsed and every
    matching element that lacks ``i18n:translate`` is passed to ``replace``
    together with its patched form.  A message is logged with the computed
    severity when no matching element is found or when ``replace`` reports
    no match.  Finally the i18n nesting level is decremented if non-zero.

    :param tag: name of the element being closed; it is immediately
        overwritten by the tag name stored in the history entry.
    """
    tag, attrs, data = self._history.pop()
    data = data.strip()
    if untranslated._translatable(
            data) and not untranslated._tal_replaced_content(tag, attrs):
        # not enclosed
        if (self._i18nlevel == 0) and tag not in ['script', 'style', 'html']:
            # An empty severity means there is nothing to report or fix.
            severity = untranslated._severity(tag, attrs) or ''
            if severity:
                if untranslated.IGNORE_UNTRANSLATED in attrs.keys():
                    # Ignore untranslated data. This is necessary for
                    # including literal content, that does not need to be
                    # translated.
                    pass
                elif not untranslated.CHAMELEON_SUBST.match(data):
                    h = HTMLParser.HTMLParser()
                    # Re-parse the whole source file to locate the element.
                    with open(self._filename, 'r') as source_file:
                        bs = BeautifulSoup.BeautifulSoup(
                            source_file, 'html.parser')
                        # NOTE(review): redundant, the ``with`` block
                        # already closes the file on exit.
                        source_file.close()
                    # Search criteria: every attribute except 'selected'.
                    attr = {}
                    for key in attrs.keys():
                        if key not in ['selected']:
                            attr[key] = attrs.getValue(key)
                    values = bs.findAll(tag.lower(), attrs=attr)
                    if not values:
                        self.log(
                            'i18n:translate missing for this:\n'
                            '"""\n%s\n"""\nTag:<%s> Attrs:%s' % (data.encode('utf8'), tag, attr),
                            severity)
                    for v in values:
                        if not v.has_attr('i18n:translate'):
                            # Use the tag name from the history entry
                            # instead of the lower-cased search name.
                            v.name = tag
                            escaper = EntitySubstitution()
                            # Second candidate: a copy whose text has HTML
                            # entities substituted.
                            substitute = copy(v)
                            if v.string:
                                substitute.string = escaper.substitute_html(
                                    v.string)
                            for i in [v, substitute]:
                                # ``pattern`` is the element before the
                                # attribute is added, ``substring`` after.
                                pattern = h.unescape(str(i))
                                i['i18n:translate'] = ""
                                substring = h.unescape(str(i))
                                # NOTE(review): ``replace`` is defined
                                # elsewhere; presumably it rewrites the file
                                # near the given line and returns a truthy
                                # value on success -- confirm.
                                match = replace(
                                    self._filename, str(pattern), str(substring),
                                    self._parser.getLineNumber())
                                if match:
                                    break
                            if not match:
                                self.log(
                                    'i18n:translate missing for this:\n'
                                    '"""\n%s\n"""\nPattern: %s' % (data.encode('utf8'), str(pattern)),
                                    severity)
    # Leaving an element always closes one level of i18n nesting.
    if self._i18nlevel != 0:
        self._i18nlevel -= 1
def parse_warning_page(page_source, warning):
    """
    Parse the source of a warning HTML page and fill in name and description.

    ``warning`` must already contain ``'key'``; its ``'name'`` and
    ``'description'`` entries are set in place and the same dict is returned.
    """
    main = BeautifulSoup(page_source, 'html.parser').find('main')
    key = warning['key']

    # Name candidates, in order: the <h1> header (the key is passed as the
    # fallback), then a paragraph inside a blockquote, which sometimes holds
    # a better description.
    warning['name'] = name(main.find('h1'), key, key)
    warning['name'] = name(
        main.select_one('blockquote > p'), key, warning['name'])

    desc = ''
    for paragraph in main.select('main > p'):
        markup = str(paragraph)
        # Compiler messages: a short leading <p> is really a header, so it
        # feeds the name and the search continues with the next paragraph.
        if 'Compiler Warning ' in warning['name'] and len(markup) < 200:
            warning['name'] = name(paragraph, key, warning['name'])
            continue
        # Only the first real paragraph is kept: the XML otherwise becomes
        # too large.
        desc += markup
        break

    if not desc:
        # Repeat the header in the description to have something.
        escaped_name = EntitySubstitution().substitute_html(warning['name'])
        desc = '<p>' + escaped_name + '</p>'

    warning['description'] = desc
    return warning
def xml_escape(string):
    """Formatter function for writing to file.

    Escapes ``&``, ``<`` and ``>`` through
    ``EntitySubstitution.substitute_xml`` and additionally replaces both
    quote characters with the predefined XML entities ``&quot;`` and
    ``&apos;``, so the result is also safe inside attribute values.

    :param string: text to escape.
    :returns: the escaped text.
    """
    # BeautifulSoup leaves quote characters alone, so they are handled here.
    ret = EntitySubstitution.substitute_xml(string)
    ret = ret.replace('"', '&quot;')
    ret = ret.replace("'", '&apos;')
    return ret
def make_opening_tag(self, tag_node, self_closing=False):
    """Build the opening tag markup for ``tag_node``.

    :param tag_node: object exposing ``name`` and an ``attrs`` mapping;
        list-valued attributes are joined with single spaces.
    :param self_closing: when true the tag ends with ``/>`` instead of ``>``.
    :returns: a unicode string such as ``<a href="...">``.
    """
    components = [tag_node.name]
    for attr, value in tag_node.attrs.iteritems():
        if isinstance(value, list):
            value = u" ".join(value)
        # substitute_html() leaves double quotes untouched, but the value is
        # wrapped in double quotes below, so they must be escaped here or a
        # quote inside the value would terminate the attribute early.
        escaped = EntitySubstitution.substitute_html(value).replace(
            u'"', u'&quot;')
        components.append(u'%s="%s"' % (attr, escaped))
    end = u"/>" if self_closing else u">"
    return u"<" + u" ".join(components) + end
def save_sent_viz_file(x, name, scores, k, args):
    """Write an HTML page that highlights the selected words of each sample.

    For every row of ``x`` the leading zero ids are skipped, the remaining
    word ids are mapped through ``id_to_word`` (loaded from
    ``<args.outdir>/id_to_word.pkl``) and written as one ``<p>`` element.
    Words whose entry in the result of ``get_selected_words`` is non-zero
    are wrapped in ``<mark><strong>``.

    :param x: iterable of word-id sequences.
    :param name: suffix for the output file ``sent_viz_L2X<name>.html``.
    :param scores: per-sample scores, indexed like ``x``.
    :param k: forwarded to ``get_selected_words``.
    :param args: object providing ``outdir``.
    """
    import io

    escaper = EntitySubstitution()
    # NOTE(security): pickle.load can execute arbitrary code; this is only
    # acceptable as long as id_to_word.pkl is produced by this project.
    with open(os.path.join(args.outdir, 'id_to_word.pkl'), 'rb') as f:
        id_to_word = pickle.load(f)

    out_path = os.path.join(args.outdir, 'sent_viz_L2X' + name + '.html')
    # io.open with an explicit encoding takes unicode text on both Python 2
    # and 3; writing pre-encoded bytes to a text-mode file fails on Python 3.
    with io.open(out_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(u"<!DOCTYPE html>\n<html>\n<body>\n")
        for i, x_single in enumerate(x):
            x_selected = get_selected_words(
                x_single, scores[i], id_to_word, k)

            # Index of the first non-zero id.  It stays 0 for an empty row
            # and ends on the last index for an all-zero row.
            start = 0
            for start, word_id in enumerate(x_single):
                if word_id != 0:
                    break

            # Slice once instead of once per word.
            selected_tail = x_selected[start:]
            sent_viz = []
            for wp, wi in enumerate(x_single[start:]):
                word = escaper.substitute_html(id_to_word[wi])
                if selected_tail[wp] != 0:
                    word = u"<mark><strong>" + word + u"</strong></mark>"
                sent_viz.append(word)
            txt_file.write(u"<p>" + u" ".join(sent_viz) + u"</p><br>\n")
        txt_file.write(u"</body>\n</html>\n")
def extract_content(self): soup = BeautifulSoup(self.html, from_encoding="utf-8") content = soup.find("div", {"id" : "contenu"}) #print dir(content) #print help(content) for x in dir(content): print "\t", x.title() ,type(getattr(content, x)) print content.parent #print type(content) if content is not None: #todo test all function to find the best #print content.string #print content.contents #print content.getText() #print content.get_text() #Get all child strings, concatenated using the given separator. #print content.getText() #print content.Text() #print content.Name() #print content.strings # Get all child strings, concatenated using the given separator. print "title", soup.title.string with open("wiki_content.html", "wb") as myfile: myfile.write(EntitySubstitution.substitute_html(unicode(content)).encode("UTF-8")) #myfile.write(EntitySubstitution.substitute_html(unicode(content)).encode("UTF-8")) #myfile.write(unicode(content.prettify(formatter="html")).encode("UTF-8")) """ for elem in content.contents: print type(elem) myfile.write(elem.encode("UTF-8")) """ print "content",type(content) #self.content = content.string self.content = content.contents print type(self.content) print len(self.content)
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Render this tag and its contents as a string.

    :param indent_level: ``None`` for compact output, otherwise the nesting
        depth used for pretty-printing.
    :param eventual_encoding: encoding handed to attribute values that
        support charset substitution and to ``decode_contents``.
    :param formatter: passed to ``format_string`` and ``decode_contents``.
    """
    rendered_attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                # Valueless attribute: just the name.
                rendered_attrs.append(key)
                continue
            if isinstance(val, (list, tuple)):
                val = ' '.join(val)
            elif not isinstance(val, basestring):
                val = unicode(val)
            elif (eventual_encoding is not None
                  and isinstance(val, AttributeValueWithCharsetSubstitution)):
                val = val.encode(eventual_encoding)
            quoted = EntitySubstitution.quoted_attribute_value(
                self.format_string(val, formatter))
            rendered_attrs.append(unicode(key) + '=' + quoted)

    prefix = (self.prefix + ":") if self.prefix else ''
    if self.is_empty_element:
        close, closeTag = '/', ''
    else:
        close, closeTag = '', '</%s%s>' % (prefix, self.name)

    pretty_print = indent_level is not None
    space = ' ' * (indent_level - 1) if pretty_print else ''
    indent_contents = indent_level + 1 if pretty_print else None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # A hidden tag contributes only its contents.
        return contents

    attribute_string = ''
    if rendered_attrs:
        attribute_string = ' ' + ' '.join(rendered_attrs)

    pieces = []
    if pretty_print:
        pieces.append(space)
    pieces.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        pieces.append("\n")
    pieces.append(contents)
    if pretty_print:
        if contents and contents[-1] != "\n":
            pieces.append("\n")
        if closeTag:
            pieces.append(space)
    pieces.append(closeTag)
    if pretty_print and closeTag and self.next_sibling:
        pieces.append("\n")
    return ''.join(pieces)
def uppercase_and_substitute_html_entities(string):
    """Return ``string`` with HTML entities substituted and newlines removed.

    NOTE(review): despite its name the function does not change the case of
    the text.
    """
    return EntitySubstitution.substitute_html(string).replace('\n', '')
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           substitute_html_entities=False):
    """Return a string representation of this tag and its contents.

    :param eventual_encoding: The encoding the output will eventually be
        converted to.  This method does not perform that conversion; the
        value is only used to fill in ``%SOUP-ENCODING%`` placeholders in
        attribute values and is forwarded to ``decode_contents``.
    """
    def render_attribute(key, val):
        # A valueless attribute is rendered as its bare name.
        if val is None:
            return key
        if not isinstance(val, basestring):
            val = str(val)
        if (self.contains_substitutions
                and eventual_encoding is not None
                and '%SOUP-ENCODING%' in val):
            val = self.substituteEncoding(val, eventual_encoding)
        return key + '=' + EntitySubstitution.substitute_xml(val, True)

    attrs = []
    if self.attrs:
        attrs = [render_attribute(key, val)
                 for key, val in sorted(self.attrs.items())]

    if self.is_empty_element:
        close, closeTag = ' /', ''
    else:
        close, closeTag = '', '</%s>' % self.name

    pretty_print = indent_level is not None
    space = ' ' * (indent_level - 1) if pretty_print else ''
    indent_contents = indent_level + 1 if pretty_print else None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, substitute_html_entities)

    if self.hidden:
        # This is the 'document root' object.
        return contents

    attributeString = (' ' + ' '.join(attrs)) if attrs else ''
    parts = []
    if pretty_print:
        parts.append(space)
    parts.append('<%s%s%s>' % (self.name, attributeString, close))
    if pretty_print:
        parts.append("\n")
    parts.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        parts.append("\n")
    if pretty_print and closeTag:
        parts.append(space)
    parts.append(closeTag)
    if pretty_print and closeTag and self.nextSibling:
        parts.append("\n")
    return ''.join(parts)
def output_ready(self, substitute_html_entities=False):
    """Return this string wrapped in the class's PREFIX and SUFFIX.

    When ``substitute_html_entities`` is true the text is first passed
    through ``EntitySubstitution.substitute_html``.
    """
    body = (EntitySubstitution.substitute_html(self)
            if substitute_html_entities else self)
    return self.PREFIX + body + self.SUFFIX
html.append(body) return html def cute_country(country_code): cute = { # 'CA': "🇨🇦", 'CA': "🏒", 'CH': "🇨🇭", 'US': "🇺🇸", "TW": "🐉", } return cute.get(country_code, country_code) escaper = EntitySubstitution() def cute_stype(x): cute = { 'laptop': u"💻", 'duckiebot': u"🚗" } s = cute.get(x, x) return escaper.substitute_html(s) if __name__ == '__main__': filename = sys.argv[1] if len(sys.argv) >= 3: output = sys.argv[2] else:
# @viticci # A simple HTML encoder for clipboard contents # -*- coding: utf-8 -*- import clipboard text = clipboard.get() from bs4.dammit import EntitySubstitution print EntitySubstitution.substitute_html(text)
def output_dom(self, tag):
    """Recursively write ``tag`` and its children to ``self.dom``.

    The opening tag is written with its attributes, then the contents
    (``<pre>`` contents verbatim, child tags recursively, visible strings
    stripped and UTF-8 encoded), then the closing tag and a newline.

    :param tag: element to serialise; it must expose ``attrs``, ``name``,
        ``contents``, ``is_empty_element``, ``format_string`` and
        ``encode_contents``.
    """
    # Render every attribute as name="value", or as a bare name when it has
    # no value.
    attrs = []
    if tag.attrs:
        for key, val in tag.attrs.iteritems():
            if val is None:
                decoded = key
            else:
                if isinstance(val, list) or isinstance(val, tuple):
                    val = ' '.join(val)
                elif not isinstance(val, basestring):
                    val = unicode(val)
                elif isinstance(val, AttributeValueWithCharsetSubstitution):
                    val = val.encode('utf-8')
                text = tag.format_string(val)
                decoded = (
                    unicode(key) + '=' +
                    EntitySubstitution.quoted_attribute_value(text))
            attrs.append(decoded)
    close = ''
    close_tag = ''
    if tag.is_empty_element:
        close = '/'
    else:
        close_tag = '</%s>' % tag.name
    attribute_string = ''
    if attrs:
        attribute_string = ' ' + ' '.join(attrs)
    self.dom.write('<%s%s%s>' % (tag.name, attribute_string, close))
    # Text nodes count as visible unless they are comments.
    is_visible_string = lambda s: (isinstance(s, NavigableString) and
                                   not isinstance(s, Comment))
    if tag.contents:
        # Tracks whether the newline that precedes the first child tag has
        # already been written.
        has_print = False
        if tag.name == 'pre':
            # <pre> contents are written without any reformatting.
            self.dom.write(tag.encode_contents(encoding='utf-8'))
        elif len(tag.contents) == 1:
            sub_tag = tag.contents[0]
            if isinstance(sub_tag, Tag):
                if not has_print:
                    self.dom.write('\n')
                    has_print = True
                self.output_dom(sub_tag)
            elif is_visible_string(sub_tag):
                self.dom.write(sub_tag.output_ready(formatter='html')
                               .strip().encode('utf-8'))
        else:
            for sub_tag in tag.contents:
                if isinstance(sub_tag, Tag):
                    if not has_print:
                        self.dom.write('\n')
                        has_print = True
                    self.output_dom(sub_tag)
                elif is_visible_string(sub_tag) and not sub_tag.isspace():
                    # Collapse leading/trailing whitespace to at most one
                    # space on each side.
                    prefix = postfix = ''
                    if sub_tag[0].isspace():
                        prefix = ' '
                    if sub_tag[-1].isspace():
                        postfix = ' '
                    self.dom.write(
                        prefix +
                        sub_tag.output_ready(formatter='html')
                        .strip().encode('utf-8') +
                        postfix)
    self.dom.write(close_tag)
    self.dom.write('\n')
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Return a string representation of this tag and its contents.

    :param eventual_encoding: The encoding the output is destined for.
        This method does not perform that encoding; the value is handed to
        attribute values that support charset substitution and forwarded
        to ``decode_contents``.
    """
    # Resolve a formatter name to a function once, so the lookup is not
    # repeated for every nested call.
    if not callable(formatter):
        formatter = self._formatter_for_name(formatter)

    attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                attrs.append(key)
                continue
            if isinstance(val, (list, tuple)):
                val = ' '.join(val)
            elif not isinstance(val, basestring):
                val = unicode(val)
            elif (eventual_encoding is not None
                  and isinstance(val, AttributeValueWithCharsetSubstitution)):
                val = val.encode(eventual_encoding)
            text = self.format_string(val, formatter)
            attrs.append(
                unicode(key) + '='
                + EntitySubstitution.quoted_attribute_value(text))

    prefix = (self.prefix + ":") if self.prefix else ''
    if self.is_empty_element:
        close, closeTag = '/', ''
    else:
        close, closeTag = '', '</%s%s>' % (prefix, self.name)

    pretty_print = self._should_pretty_print(indent_level)
    has_indent = indent_level is not None
    indent_space = ' ' * (indent_level - 1) if has_indent else ''
    space = indent_space if pretty_print else ''
    indent_contents = indent_level + 1 if pretty_print else None

    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object.
        return contents

    attribute_string = (' ' + ' '.join(attrs)) if attrs else ''
    out = []
    if has_indent:
        # Even if this particular tag is not pretty-printed,
        # we should indent up to the start of the tag.
        out.append(indent_space)
    out.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        out.append("\n")
    out.append(contents)
    if pretty_print:
        if contents and contents[-1] != "\n":
            out.append("\n")
        if closeTag:
            out.append(space)
    out.append(closeTag)
    if has_indent and closeTag and self.next_sibling:
        # Even if this particular tag is not pretty-printed,
        # we're now done with the tag, and we should add a
        # newline if appropriate.
        out.append("\n")
    return ''.join(out)
def substitute_html_entities(str):
    """Substitute HTML entities in ``str``, then straighten curly quotes.

    NOTE(review): ``substitute_html`` may already turn the curly quotes into
    named entities, in which case the replacements below never match --
    confirm the intended order.
    """
    substituted = EntitySubstitution.substitute_html(str)
    for curly, plain in (("“", "\""), ("”", "\""), ("’", "'")):
        substituted = substituted.replace(curly, plain)
    return substituted
def envioMail(): print "Comenzamos envioMail" import base64 from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText from email.mime.image import MIMEImage # Establecemos conexion con el servidor smtp de gmail mailServer = smtplib.SMTP('smtp.gmail.com',587) mailServer.ehlo() mailServer.starttls() mailServer.ehlo() password = base64.b64decode("Q29uc3RhbmNpYTIx") mailServer.login("*****@*****.**",password) # Construimos un mensaje Multipart, con un texto y una imagen adjunta # Establecemos la cuentadesde cuentaDesde = "*****@*****.**" from pymongo import MongoClient as Connection from pymongo import DESCENDING cadenaCon= 'mongodb://*****:*****@ds029635.mlab.com:29635/othesoluciones1' MONGODB_URI =cadenaCon MONGODB_URI = 'mongodb://*****:*****@ds029635.mlab.com:29635/othesoluciones1' db = Connection(MONGODB_URI).othesoluciones1 import datetime import numpy as np import pandas as pd fecha = (datetime.date.today()+datetime.timedelta(days=0)).strftime('%d/%m/%Y') fecha = datetime.datetime.strptime(fecha,'%d/%m/%Y') print "Fecha de hoy-->", fecha dfmm = pd.DataFrame() for doc in db.coleccion_notificaciones.find(): if ((datetime.datetime.strptime(doc['fdesde'],'%d/%m/%Y')<= fecha) and (fecha <= datetime.datetime.strptime(doc['fhasta'],'%d/%m/%Y'))): df_aux=pd.DataFrame([doc['email'],doc['municipio'], doc['fhasta']]) dfmm= dfmm.append(df_aux.T, ignore_index=True) print "****************************************************************" from lxml import etree import time doc=etree.parse("static/Municipios/madrid.xml") muni=doc.findall("municipio") print dfmm if (len(dfmm)>0): print "Existen notificaciones que enviar" #Obtenemos la lista de emails distintos for j in dfmm[0].unique(): # Construimos un mensaje Multipart, en el que vamos a incluir texto y una imagen adjunta # El cuerpo del texto del mensaje dependera del numero de suscripciones activas que tenga un usuario para el dia actual texto="" mensaje = MIMEMultipart() mensaje['From']=cuentaDesde 
cuentaPara=j mensaje['To']=cuentaPara for i in range(0, len(dfmm)): if (dfmm.ix[i,0]==j): for k in range(0,len(muni)): if (muni[k].attrib["value"][-5:]==dfmm.ix[i,1]): hoy = (datetime.date.today()+datetime.timedelta(days=0)).strftime('%d-%m-%Y') manana=(datetime.date.today()+datetime.timedelta(days=1)).strftime('%d-%m-%Y') pasadomanana=(datetime.date.today()+datetime.timedelta(days=2)).strftime('%d-%m-%Y') collection1 = db.PrediccionOTHE name2 = elimina_tildes(unicode(muni[k].text[:])) cursor1 = collection1.find_one({"Municipio": name2}) predHoy = cursor1["Alerta "+hoy] predManana= cursor1["Alerta "+manana] predPasadoManana=cursor1["Alerta "+pasadomanana] from bs4.dammit import EntitySubstitution unsubbed = unicode(muni[k].text[:]) esub = EntitySubstitution() subbed = esub.substitute_html(unsubbed) print "Activa hasta el: ", dfmm.ix[i,2] fhasta = str(dfmm.ix[i,2]).replace("/","-") texto = texto+str("<h3>"+subbed+":</h3><p> </p>") texto = texto+str("<p>El Nivel de Alerta de Gramíneas para el día " +hoy+" es: <b>"+str((predHoy))+"</b></p>") texto = texto+str("<p>El Nivel de Alerta de Gramíneas para el día " +manana+" es: <b>"+str((predManana))+"</b></p>") texto = texto+str("<p>El Nivel de Alerta de Gramíneas para el día " +pasadomanana+" es: <b>"+str((predPasadoManana))+"</b></p>") if (hoy!=fhasta): texto = texto+str("<p>Recibirá esta notificación hasta el: <b>"+fhasta+"</b></p>") else: texto = texto+str("<p>Hoy día <b>"+fhasta+"</b> es el último en el que recibirá esta notificación</p>") texto = texto+str("<hr>") #Establecemos el Asunto del Email mensaje['Subject']= hoy+". 
Servicio de Notificaciones" #Establecemos el texto comun de los emails html_inic = """\ <html> <head></head> <body> <p>Buenos días,</p> <p>Estas son las notificaciones que ha solicitado:</p><br></br>""" html_fin="""\ <br></br> <p>Deseamos que pase un gran día.</p> <p>Para más información puede consultar nuestra web: http://gramineas-madrid.herokuapp.com/</p> <p>Reciba un cordial saludo por parte del equipo de Othe Soluciones</p> <img src="cid:logo" alt="Othe Soluciones" height="52" width="52"></img> </html>""" #Y lo juntamos en una cadena html=str(html_inic+texto+html_fin) #Montamos todo el cuerpo del mensaje mensaje.attach(MIMEText(html,'html')) # Adjuntamos la imagen file = open("static/style/logo.jpg", "rb") contenido = MIMEImage(file.read()) contenido.add_header('Content-ID', '<logo>') mensaje.attach(contenido) print "Envio mail a: ", cuentaPara # Enviamos el correo, con los campos from y to. mailServer.sendmail(cuentaDesde, cuentaPara, mensaje.as_string()) # Cierre de la conexion mailServer.close() print "Fin de envioMail con emails enviados" else: # Cierre de la conexion mailServer.close() print "Fin de envioMail no habia emails que enviar"
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a Unicode representation of this tag and its contents.

    :param indent_level: ``None`` for compact output, otherwise the nesting
        depth used for pretty-printing.
    :param eventual_encoding: The tag is destined to be
       encoded into this encoding. This method is _not_
       responsible for performing that encoding. This information
       is passed in so that it can be substituted in if the
       document contains a <META> tag that mentions the document's
       encoding.
    :param formatter: forwarded to ``decode_contents``.
    """
    attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                # Valueless attribute: just the name.
                decoded = key
            else:
                if isinstance(val, (list, tuple)):
                    val = " ".join(val)
                elif not isinstance(val, basestring):
                    val = str(val)
                if (self.contains_substitutions
                        and eventual_encoding is not None
                        and "%SOUP-ENCODING%" in val):
                    val = self.substitute_encoding(val, eventual_encoding)
                decoded = (str(key) + "="
                           + EntitySubstitution.substitute_xml(val, True))
            attrs.append(decoded)

    prefix = ""
    if self.prefix:
        prefix = self.prefix + ":"

    close = ""
    closeTag = ""
    if self.is_empty_element:
        close = "/"
    else:
        # The closing tag must carry the same namespace prefix as the
        # opening tag, otherwise "<ns:a>" would be closed by "</a>".
        closeTag = "</%s%s>" % (prefix, self.name)

    pretty_print = indent_level is not None
    if pretty_print:
        space = " " * (indent_level - 1)
        indent_contents = indent_level + 1
    else:
        space = ""
        indent_contents = None
    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object.
        return contents

    s = []
    attribute_string = ""
    if attrs:
        attribute_string = " " + " ".join(attrs)
    if pretty_print:
        s.append(space)
    s.append("<%s%s%s%s>" % (prefix, self.name, attribute_string, close))
    if pretty_print:
        s.append("\n")
    s.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        s.append("\n")
    if pretty_print and closeTag:
        s.append(space)
    s.append(closeTag)
    if pretty_print and closeTag and self.next_sibling:
        s.append("\n")
    return "".join(s)
def _html_entities(self, string): if '&' in string: return string else: return EntitySubstitution.substitute_html(string)
from bs4.dammit import EntitySubstitution

esub = EntitySubstitution()


def sanitize_html(title):
    """Return ``title`` with HTML entities substituted."""
    return esub.substitute_html(title)


def sanitize_irc(title):
    """Return ``title`` without carriage returns, newlines and \\x01."""
    badchars = "\r\n\x01"
    kept = [c for c in title if c not in badchars]
    return "".join(kept)


escapers = {"html": sanitize_html, "irc": sanitize_irc}


def escape(title, mode):
    """Escape ``title`` according to ``mode``.

    A falsy mode means ``"irc"``; ``"all"`` applies every registered
    escaper in turn; an unknown mode returns the title unchanged.
    """
    mode = mode or "irc"
    if mode == "all":
        for func in list(escapers.values()):
            title = func(title)
        return title
    escaper = escapers.get(mode)
    if escaper is None:
        return title
    return escaper(title)
def output_ready(self, substitute_html_entities=False):
    """Wrap this string in PREFIX/SUFFIX, optionally entity-substituted.

    The text is passed through ``EntitySubstitution.substitute_html`` only
    when ``substitute_html_entities`` is true.
    """
    output = self
    if substitute_html_entities:
        output = EntitySubstitution.substitute_html(self)
    wrapped = self.PREFIX + output + self.SUFFIX
    return wrapped
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           substitute_html_entities=False):
    """Return a string representation of this tag and its contents.

    :param eventual_encoding: The encoding the output will eventually be
        converted to.  This method does not perform that conversion; the
        value is only used to fill in ``%SOUP-ENCODING%`` placeholders in
        attribute values and is forwarded to ``decode_contents``.
    """
    attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                # Valueless attribute: just the name.
                attrs.append(key)
                continue
            if not isinstance(val, basestring):
                val = str(val)
            needs_encoding = (self.contains_substitutions
                              and eventual_encoding is not None
                              and '%SOUP-ENCODING%' in val)
            if needs_encoding:
                val = self.substituteEncoding(val, eventual_encoding)
            attrs.append(key + '=' + EntitySubstitution.substitute_xml(val, True))

    empty = self.is_empty_element
    close = ' /' if empty else ''
    closeTag = '' if empty else '</%s>' % self.name

    pretty_print = (indent_level is not None)
    if not pretty_print:
        space, indent_contents = '', None
    else:
        space, indent_contents = ' ' * (indent_level - 1), indent_level + 1

    contents = self.decode_contents(indent_contents, eventual_encoding,
                                    substitute_html_entities)
    if self.hidden:
        # This is the 'document root' object.
        return contents

    attributeString = ''
    if attrs:
        attributeString = ' ' + ' '.join(attrs)

    chunks = [space] if pretty_print else []
    chunks.append('<%s%s%s>' % (self.name, attributeString, close))
    if pretty_print:
        chunks.append("\n")
    chunks.append(contents)
    if pretty_print:
        if contents and contents[-1] != "\n":
            chunks.append("\n")
        if closeTag:
            chunks.append(space)
    chunks.append(closeTag)
    if pretty_print and closeTag and self.nextSibling:
        chunks.append("\n")
    return ''.join(chunks)
import time import string from twitter import * import yweather import codecs import io import shutil import sys from HTMLParser import HTMLParser # HTML unescaping via standard lib from bs4.dammit import EntitySubstitution # HTML escaping, via BeautifulSoup4 reload(sys) sys.setdefaultencoding('utf-8') htmlparser = HTMLParser() esub = EntitySubstitution() html = """ <!DOCTYPE html> <html > <head> <meta charset="UTF-8"> <title>Bhagya</title> <style type="text/css"> #mtabs_wrapper { width: 100%%;
# -*- coding: utf-8 -*-
from __future__ import print_function

import clipboard
from bs4.dammit import EntitySubstitution

# Read the clipboard and print its contents with HTML entities substituted.
text = clipboard.get()
print(EntitySubstitution.substitute_html(text))
def encode_url(string):
    """Return ``string`` with HTML entities substituted.

    NOTE(review): despite the name, no URL (percent) encoding is applied.
    """
    return EntitySubstitution().substitute_html(string)
#tempo[cle] = [] tempo[cle2].append(tempoClass[cle2]) tempo2[cle2].append(tempoClass[cle2]) else: tempo[cle2] = [] tempo2[cle2] = [] tempo[cle2].append(tempoClass[cle2]) tempo2[cle2].append(tempoClass[cle2]) else: temp = unicode(' '.join(brev[cle])) tempo[cle] = temp tempo2[cle] = brev[cle] elif cle == 'titre': temp = unicode(brev[cle]).replace('[', '').replace( ']', '').lower().capitalize() formate = EntitySubstitution() soup = bs4.BeautifulSoup(temp) temp = soup.text tempo[cle] = temp #tempo2 [cle] = temp elif cle == 'date': tempo[cle] = str(brev['date'].year) + '-' + str( brev['date'].month) + '-' + str(brev['date'].day) tempo2[cle] = str( brev['date'].year) # just the year in Pivottable elif cle == 'classification' and brev['classification'] != u'': tempoClass = OPS2NetUtils2.ExtractClassificationSimple2( brev['classification']) for cle in tempoClass.keys(): if cle in tempo.keys( ) and tempoClass[cle] not in tempo[cle]:
def _print_label_cell(label_html):
    """Open a left-aligned table row and print its label cell."""
    print("<tr style='text-align:left'>")
    print("<td style='width:20%; vertical-align:top'>")
    print("<label>")
    print(label_html)
    print("</label>")
    print("</td>")


def _print_topics_by_cluster(topics_all_clusters, escaper, bold_cluster=False):
    """Print one table row per cluster, listing that cluster's topics."""
    if bold_cluster:
        cluster_cell = "<td style='text-align:center'><b>{}</b></td>"
    else:
        cluster_cell = "<td style='text-align:center'>{}</td>"

    current = None
    row_open = False
    for index, row in topics_all_clusters.iterrows():
        # A new table row starts with the first record and whenever the
        # cluster id changes; every topic is printed exactly once.
        if not row_open or current != row[0]:
            if row_open:
                print("</td>")
                print("</tr>")
                print("<tr><td colspan=2><hr></td></tr>")
            print("<tr>")
            print(cluster_cell.format(escaper.substitute_html(str(row[0]))))
            print("<td>")
            current = row[0]
            row_open = True
        print(f"{escaper.substitute_html(str(row[1]))}<br>")
    if row_open:
        print("</td>")
        print("</tr>")


def main():
    """CGI entry point: render the cluster report for the submitted form.

    The ``action_on_post`` form field selects the report:

    * ``clinical_description`` / ``case_id`` -- run ``find_cluster_newcase``
      and show the case, its assigned cluster, the distances to every
      cluster and the topics of all clusters;
    * ``keyword`` -- run ``find_keyword`` and show the clusters containing
      the keyword followed by the topics of all clusters;
    * anything else -- print "Nothing to do".

    Every value that does not come from this file is HTML-escaped before it
    is written to the page.
    """
    escaper = EntitySubstitution()
    form = cgi.FieldStorage()
    print("Content-type: text/html\n\n")
    print('''
<html>
<head>
<title>BitsxlaMarato 2020 - La FrancoArgentina Team</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<script type="text/javascript" src="/jquery/jquery-3.3.1.min.js"></script>
<link rel="stylesheet" type="text/css" href="/jquery/DataTables/bootstrap.min.css"/>
<link href="/jquery/DataTables/DataTables-1.10.18/css/jquery.dataTables.css" rel="stylesheet" type="text/css" />
<script src="/jquery/DataTables/DataTables-1.10.18/js/jquery.dataTables.js"></script>
<link rel="stylesheet" type="text/css" href="/jquery/DataTables/dataTables.bootstrap.min.css"/>
<script src="stylesheet" type="text/css" href="/jquery/DataTables/dataTables.js"/></script>
<style>
.blue-button { display: inline-block; -webkit-box-sizing: content-box; -moz-box-sizing: content-box; box-sizing: content-box; cursor: pointer; padding: 5px 15px; border: 1px solid #018dc4; -webkit-border-radius: 3px; border-radius: 3px; font: normal 16px/normal "Times New Roman", Times, serif; color: rgba(255,255,255,0.9); -o-text-overflow: clip; text-overflow: clip; background: #787A7D; -webkit-box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ; box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ; text-shadow: -1px -1px 0 rgba(15,73,168,0.66) ; -webkit-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1); -moz-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1); -o-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1); transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1); }
.light-blue-button { display: inline-block; -webkit-box-sizing: content-box; -moz-box-sizing: content-box; box-sizing: content-box; cursor: pointer; padding: 2px 8px; border: 1px solid #018dc4; -webkit-border-radius: 3px; border-radius: 3px; font: normal 12px/normal "Times New Roman", Times, serif; color: rgba(255,255,255,0.9); -o-text-overflow: clip; text-overflow: clip; background: #a6cfe0; -webkit-box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ; box-shadow: 3px 3px 5px 0 rgba(0,0,0,0.2) ; text-shadow: -1px -1px 0 rgba(15,73,168,0.66) ; -webkit-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1); -moz-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1); -o-transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1); transition: all 300ms cubic-bezier(0.42, 0, 0.58, 1); }
.text-input { display: inline-block; -webkit-box-sizing: content-box; -moz-box-sizing: content-box; box-sizing: content-box; padding: 4px 10px; border: 1px solid #b7b7b7; margin-bottom: 30px; -webkit-border-radius: 3px; border-radius: 3px; font: normal 16px/normal "Times New Roman", Times, serif; color: rgba(0,142,198,1); -o-text-overflow: clip; text-overflow: clip; letter-spacing: 1px; word-spacing: 2px; background: rgba(234,234,234,1); -webkit-box-shadow: 2px 2px 2px 0 rgba(0,0,0,0.2) inset; box-shadow: 2px 2px 2px 0 rgba(0,0,0,0.2) inset; text-shadow: 1px 1px 0 rgba(255,255,255,0.66) ; -webkit-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1); -moz-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1); -o-transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1); transition: all 200ms cubic-bezier(0.42, 0, 0.58, 1); }
#title-1 { font-family: Verdana, Geneva, sans-serif; font-size: 24px; letter-spacing: 0.4px; word-spacing: 0px; color: #000000; font-weight: 700; text-decoration: none; font-style: normal; font-variant: normal; text-transform: none; }
#title-2 { font-family: Verdana, Geneva, sans-serif; font-size: 20px; letter-spacing: 0.4px; word-spacing: 0px; color: #000000; font-weight: 700; text-decoration: none; font-style: normal; font-variant: normal; text-transform: none; vertical-align: middle; text-align: center; }
#title-3 { font-family: Verdana, Geneva, sans-serif; font-size: 12px; letter-spacing: 0.4px; word-spacing: 0px; color: #000000; font-weight: 700; text-decoration: none; font-style: normal; font-variant: normal; text-transform: none; }
#all-content { margin: auto; }
.center { text-align: center; }
.row { min-height: 100px; position: relative; text-align: center; }
.column_center { display: inline-block; padding: 20px; border:1px solid red; }
label { float: center; margin: 10 30px; }
</style>
</head>
''')
    print('''<body>
<div id="all-content">
<div class="row">
<div id="title-2">BitsxlaMarato 2020 - La FrancoArgentina Team</div>
<br>
<br>
''')

    # ``report`` stays None when the form carries no recognised action, so
    # neither report section runs on unbound variables.
    action = form.getfirst('action_on_post', None)
    report = None
    if action == "clinical_description":
        (case_id, case_desc, doc_simil_clust, top_cluster,
         topics_all_clusters, topics_top_cluster) = find_cluster_newcase(
            case_id=None, case_desc=form.getfirst('clinical_desc'))
        report = "case"
    elif action == "case_id":
        (case_id, case_desc, doc_simil_clust, top_cluster,
         topics_all_clusters, topics_top_cluster) = find_cluster_newcase(
            case_id=form.getfirst('clinical_id'), case_desc=None)
        report = "case"
    elif action == "keyword":
        clusters, topics_all_clusters = find_keyword(
            keyword=form.getfirst('keyword'))
        report = "keyword"
    else:
        print('''
        Nothing to do
        ''')

    if report == "case":
        print("<table border=0>")

        _print_label_cell("<b>Case ID:</b>")
        print("<td style='width:85%'>")
        # The case id may come straight from the request, so escape it.
        print(f"<b>{escaper.substitute_html(str(case_id))}</b>")
        print("</td>")
        print("</tr>")

        _print_label_cell("<b>Case description:</b>")
        print("<td style='width:85%'>")
        print(f"{escaper.substitute_html(case_desc)}")
        print("</td>")
        print("</tr>")

        _print_label_cell("<b>Assigned to cluster:</b>")
        print("<td style='width:85%'>")
        print(f"{escaper.substitute_html(str(top_cluster))}")
        print("</td>")
        print("</tr>")

        _print_label_cell("<b>Topics in the assigned cluster:</b>")
        print("<td style='width:85%'>")
        for index, row in topics_top_cluster.iterrows():
            print(f"{escaper.substitute_html(str(row[1]))}<br>")
        print("</td>")
        print("</tr>")

        _print_label_cell("<b>Mean pairwise distance to each cluster:</b>")
        print("<td>")
        print("<table border='0'>")
        print("<tr>")
        print("<td style='width:45%; text-align:center'><b>Cluster</b></td>")
        print("<td style='width:55%; text-align:center'><b>Distance</b></td>")
        print("</tr>")
        for index, row in doc_simil_clust.iterrows():
            print("<tr>")
            print(
                f"<td style='width:45%; text-align:center'><b>{escaper.substitute_html(str(int(row[0])))}</b></td>"
            )
            print(
                f"<td style='width:55%; text-align:right'>{escaper.substitute_html(str(row[1]))}</td>"
            )
            print("</tr>")
        print("</table>")
        print("</td>")
        print("</table>")
        print("<hr>")

        print("<table border=0>")
        _print_label_cell("<b>Topics in all clusters:</b>")
        print("<td>")
        print("<table border='0'>")
        print("<tr>")
        print("<th style='text-align:center'><b>Cluster</b></th>")
        print("<th style='text-align:center'><b>Topics</b></th>")
        print("</tr>")
        _print_topics_by_cluster(topics_all_clusters, escaper)
        print("</table>")
        print("</td>")
        print("</table>")
        print("<br>")
    elif report == "keyword":
        if len(clusters) > 0:
            print(
                "<center><h2>Keyword found in the following clusters:</h2></center>"
            )
            print("<center>")
            print(
                "<table border=0><tr><th class='text-center'>Cluster</th><th class='text-center'>Topics</th></tr>"
            )
            for row in clusters:
                cluster_html = escaper.substitute_html(str(row['cluster']))
                topics_html = escaper.substitute_html(str(row['topics']))
                print(
                    f"<tr><td align='center'><b>{cluster_html}</b></td><td>{topics_html}</td></tr>"
                )
            print("</table>")
            print(
                "<center><h3>Look for these clusters below in order to find all the words in each of them.</h3></center>"
            )
            print("</center>")
        else:
            print("<center><h2>Keyword not found in any cluster</h2></center>")
        print("<br><hr>")
        print("<center>")
        print("<b>Topics in all clusters:</b>")
        print("<table border='0'>")
        print("<tr>")
        print("<th style='text-align:center'><b>Cluster</b></th>")
        print("<th style='text-align:center'><b>Topics</b></th>")
        print("</tr>")
        _print_topics_by_cluster(
            topics_all_clusters, escaper, bold_cluster=True)
        print("</table>")
        print("</center>")
        print("<br>")

    print('''
<hr>
<center>
<table border=0 height="100px" width="60%">
<tr>
<td> <img src="/images/FIB-web.png" height="60%"> </td>
<td> <img src="/images/hackers-upc-web.png" height="60%"> </td>
<td> <img src="/images/bsc-web.png" height="60%"> </td>
<td> <img src="/images/plan-tl-web.png" height="60%"> </td>
</tr>
</table>
</center>
''')
    print("</html>")
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a Unicode representation of this tag and its contents.

    :param indent_level: ``None`` disables pretty-printing. Otherwise the
        opening and closing tags are indented by ``indent_level - 1``
        spaces, newlines are inserted around the contents, and the
        contents are rendered at ``indent_level + 1``.
    :param eventual_encoding: The tag is destined to be encoded into
        this encoding. This method is _not_ responsible for performing
        that encoding. This information is passed in so that it can be
        substituted in if the document contains a <META> tag that
        mentions the document's encoding.
    :param formatter: Passed through unchanged to ``self.format_string``
        (for attribute values) and ``self.decode_contents``.
    :return: The rendered markup. For a hidden tag (the document root)
        only the rendered contents are returned.
    """
    # Render every attribute as either a bare name (value is None) or
    # name=<quoted value>, in sorted key order.
    attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                decoded = key
            else:
                if isinstance(val, (list, tuple)):
                    # Multi-valued attribute: join with spaces.
                    val = ' '.join(val)
                elif not isinstance(val, basestring):
                    val = str(val)
                elif (isinstance(val, AttributeValueWithCharsetSubstitution)
                      and eventual_encoding is not None):
                    # Let the value substitute the target encoding name.
                    val = val.encode(eventual_encoding)

                text = self.format_string(val, formatter)
                decoded = (
                    str(key) + '='
                    + EntitySubstitution.quoted_attribute_value(text))
            attrs.append(decoded)

    # The namespace prefix must be computed before the closing tag so that
    # both tags carry it; previously only the opening tag did, which
    # produced mismatched markup such as <ns:tag>...</tag>.
    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    close = ''
    closeTag = ''
    if self.is_empty_element:
        close = '/'
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = (indent_level is not None)
    if pretty_print:
        space = (' ' * (indent_level - 1))
        indent_contents = indent_level + 1
    else:
        space = ''
        indent_contents = None

    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object.
        return contents

    attribute_string = ''
    if attrs:
        attribute_string = ' ' + ' '.join(attrs)

    s = []
    if pretty_print:
        s.append(space)
    s.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        s.append("\n")
    s.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        s.append("\n")
    if pretty_print and closeTag:
        s.append(space)
    s.append(closeTag)
    if pretty_print and closeTag and self.next_sibling:
        # Separate this tag from the sibling that follows it.
        s.append("\n")
    return ''.join(s)
def custom_formatter(string):
    """Entity-escape *string*, additionally escaping both quote characters.

    The text is first passed through ``EntitySubstitution.substitute_html``;
    any double or single quote characters left in the result are then
    replaced with ``&quot;`` and ``&#39;`` respectively.

    :param string: The text to escape.
    :return: The escaped text.
    """
    escaped = EntitySubstitution.substitute_html(string)
    # The replacement targets must be the entity spellings: replacing a quote
    # with itself is a no-op. The numeric reference &#39; is used for the
    # apostrophe because it is valid in both HTML and XML.
    return escaped.replace('"', '&quot;').replace("'", '&#39;')
def decode(self, indent_level=None,
           eventual_encoding=DEFAULT_OUTPUT_ENCODING,
           formatter="minimal"):
    """Returns a Unicode representation of this tag and its contents.

    :param indent_level: ``None`` disables pretty-printing. Otherwise the
        opening and closing tags are indented by ``indent_level - 1``
        spaces, newlines are inserted around the contents, and the
        contents are rendered at ``indent_level + 1``.
    :param eventual_encoding: The tag is destined to be encoded into
        this encoding. This method is _not_ responsible for performing
        that encoding. This information is passed in so that it can be
        substituted in if the document contains a <META> tag that
        mentions the document's encoding.
    :param formatter: Passed through unchanged to ``self.format_string``
        (for attribute values) and ``self.decode_contents``.
    :return: The rendered markup. For a hidden tag (the document root)
        only the rendered contents are returned.
    """
    # Render every attribute as either a bare name (value is None) or
    # name=<quoted value>, in sorted key order.
    attrs = []
    if self.attrs:
        for key, val in sorted(self.attrs.items()):
            if val is None:
                decoded = key
            else:
                if isinstance(val, (list, tuple)):
                    # Multi-valued attribute: join with spaces.
                    val = ' '.join(val)
                elif not isinstance(val, str):
                    val = str(val)
                elif (isinstance(val, AttributeValueWithCharsetSubstitution)
                      and eventual_encoding is not None):
                    # Let the value substitute the target encoding name.
                    val = val.encode(eventual_encoding)

                text = self.format_string(val, formatter)
                decoded = (
                    str(key) + '='
                    + EntitySubstitution.quoted_attribute_value(text))
            attrs.append(decoded)

    # The namespace prefix must be computed before the closing tag so that
    # both tags carry it; previously only the opening tag did, which
    # produced mismatched markup such as <ns:tag>...</tag>.
    prefix = ''
    if self.prefix:
        prefix = self.prefix + ":"

    close = ''
    closeTag = ''
    if self.is_empty_element:
        close = '/'
    else:
        closeTag = '</%s%s>' % (prefix, self.name)

    pretty_print = (indent_level is not None)
    if pretty_print:
        space = (' ' * (indent_level - 1))
        indent_contents = indent_level + 1
    else:
        space = ''
        indent_contents = None

    contents = self.decode_contents(
        indent_contents, eventual_encoding, formatter)

    if self.hidden:
        # This is the 'document root' object.
        return contents

    attribute_string = ''
    if attrs:
        attribute_string = ' ' + ' '.join(attrs)

    s = []
    if pretty_print:
        s.append(space)
    s.append('<%s%s%s%s>' % (
        prefix, self.name, attribute_string, close))
    if pretty_print:
        s.append("\n")
    s.append(contents)
    if pretty_print and contents and contents[-1] != "\n":
        s.append("\n")
    if pretty_print and closeTag:
        s.append(space)
    s.append(closeTag)
    if pretty_print and closeTag and self.next_sibling:
        # Separate this tag from the sibling that follows it.
        s.append("\n")
    return ''.join(s)
def substitute_html_entities(str):
    """Entity-escape text, rendering curly quotes as plain ASCII quotes.

    Left/right double quotation marks (U+201C, U+201D) become ``"`` and the
    right single quotation mark (U+2019) becomes ``'``; the result is then
    passed through ``EntitySubstitution.substitute_html``.

    :param str: The text to escape. The name shadows the built-in ``str``
        but is kept so that existing keyword callers continue to work.
    :return: The escaped text.
    """
    # Normalise the quotes *before* entity substitution. Doing it afterwards
    # never matched: substitute_html has already rewritten the curly quotes
    # as named entities by then, so no literal curly quote is left to replace.
    straight_quotes = {0x201C: '"', 0x201D: '"', 0x2019: "'"}
    return EntitySubstitution.substitute_html(str.translate(straight_quotes))