def parse(self, response):
    # Scrapy callback: grab the first .templateContainer node, sanitize its
    # HTML, and write it to a per-page file.
    b = response.css('.templateContainer')
    page = response.url.split("/")[-1]
    filename = 'scraper/posto-%s.html' % page
    with open(filename, 'wb') as f:
        f.write(sanitize(b.get()).encode())
    self.log('Saved file %s' % filename)
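# A minimal sketch of what the callback does, using parsel directly (parsel
# backs Scrapy selectors; the sample HTML is illustrative, not from the source):
from parsel import Selector
from htmllaundry import sanitize

sel = Selector(text='<div class="templateContainer"><script>x()</script><p>Hi</p></div>')
html = sel.css('.templateContainer').get()   # first match as an HTML string
print(sanitize(html))                        # script stripped, safe markup kept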
def convert(self, data, cache, **kwargs):
    # Plone transform: turn KML Placemarks into an HTML fragment
    # (needs: from xml.etree.ElementTree import Element, XML, tostring)
    bodydom = Element('div')
    kmldom = XML(data)
    ns = kmldom.tag.strip('kml')   # leaves the '{namespace-uri}' prefix
    placemarks = kmldom.findall('.//%sPlacemark' % ns)
    for placemark in placemarks:
        titles = placemark.findall(ns + 'name')
        for title in titles:
            t = Element('h2')
            t.text = title.text
            bodydom.append(t)
        descriptions = placemark.findall(ns + 'description')
        for desc in descriptions:
            if desc.text:
                try:
                    # force ASCII, turning non-ASCII into character references
                    text = desc.text.encode('ascii', 'xmlcharrefreplace').decode('ascii').strip()
                except UnicodeError:
                    text = desc.text.strip()
                text = sanitize(text)
                d = XML('<div>' + text + '</div>')
                bodydom.append(d)
    body = tostring(bodydom)
    cache.setData(body)
    return cache
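# Sketch of the namespace trick used above (illustrative document, not from
# the source): an ElementTree tag carries its namespace in Clark notation,
# e.g. '{http://www.opengis.net/kml/2.2}kml', so str.strip('kml') happens to
# leave exactly the '{...}' prefix that findall() needs, because the URI ends
# in '}'. Note that strip removes characters, not a suffix.
from xml.etree.ElementTree import XML

doc = XML('<kml xmlns="http://www.opengis.net/kml/2.2"><Placemark/></kml>')
ns = doc.tag.strip('kml')                    # '{http://www.opengis.net/kml/2.2}'
print(doc.findall('.//%sPlacemark' % ns))    # finds the namespaced Placemark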
def cleanup_html_preparer(value):
    if value is not colander.null:
        return htmllaundry.sanitize(value, cleaner=cleaners.DocumentCleaner, wrap=None)
    return colander.null
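# A minimal usage sketch (the schema below is hypothetical, not from the
# source): colander preparers run before validation, so the value that gets
# validated and stored is already sanitized.
import colander

class PageSchema(colander.MappingSchema):
    body = colander.SchemaNode(colander.String(),
                               preparer=cleanup_html_preparer)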
def crawl():
    session = Session()
    musicians = session.query(Musician).all()
    for musician in musicians:
        musician_id = musician.id
        musician_url = musician.live_info_url
        resp = requests.get(musician_url).text
        try:
            soup = sanitize(BeautifulSoup(resp, "lxml").get_text())
        except Exception:
            # fall back to the more tolerant html5lib parser on broken markup
            soup = sanitize(BeautifulSoup(resp, "html5lib").get_text())
        content = LiveInfo(musician_id=musician_id,
                           content=soup,
                           created_at=date.today())
        session.add(content)
        session.commit()
    arrange_updates()
def convert_kml_to_page(self):
    for brain in self.portal_catalog(portal_type='File', path='iwlearn/osm'):
        obj = brain.getObject()
        data = obj.get_data()
        parent = obj.getParentNode()
        if callable(obj.id):
            obj_id = obj.id()
        else:
            obj_id = obj.id
        # str.strip('kml') would eat leading/trailing k/m/l characters
        # ('lakes.kml' -> 'akes.'), so replace the extension explicitly
        if obj_id.endswith('.kml'):
            new_obj_id = obj_id[:-3] + 'htm'
        else:
            new_obj_id = obj_id + '.htm'
        try:
            self.portal_types.constructContent('Document', parent, new_obj_id)
        except Exception:
            pass   # the document may already exist
        new_obj = parent[new_obj_id]
        if parent.id == 'lmes':
            color = '0000bf'
        elif parent.id == 'rivers':
            color = '56ffff'
        elif parent.id == 'lakes':
            color = '2c80d3'
        elif parent.id == 'aquifers':
            color = 'c1742c'
        else:
            color = '00ff00'
        features = None
        title = text = None
        try:
            features = extractfeatures_from_file(data)
            title = extract_title(data)
            if title.endswith('.kml'):   # drop the extension, if any
                title = title[:-4]
            text = extract_description(data)
            # print(features['MultiPolygon'])
            print(title)
            # print(text)
        except Exception:
            print('exception in %s' % brain.getId)
        if new_obj.getText():
            print('skipping set text for %s' % brain.getId)
        else:
            new_obj.setText(sanitize(text))
        new_obj.setTitle(title)
        if features:
            style = IGeoCustomFeatureStyle(new_obj)
            style.geostyles.data['use_custom_styles'] = True
            style.geostyles.data['polygoncolor'] = color
            style.geostyles.update(style.geostyles)
            geo = IGeoManager(new_obj)
            if features['MultiPolygon']:
                shp = features['MultiPolygon']
                q = shp.simplify(0.2).__geo_interface__
                geo.setCoordinates(q['type'], q['coordinates'])
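# A minimal sketch of the simplify step above, with a toy polygon (shapely
# assumed; the tolerance 0.2 matches the code, the geometry is made up):
from shapely.geometry import Polygon

shp = Polygon([(0, 0), (1, 0), (1, 1), (0.5, 1.001), (0, 1)])
q = shp.simplify(0.2).__geo_interface__   # GeoJSON-like mapping
print(q['type'], q['coordinates'])        # 'Polygon' plus the simplified ring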
def sanitize_statement(statement):
    '''Sanitize a statement with MathML into TeX and minimal HTML.'''
    soup = BeautifulSoup(statement, features="lxml")
    for item in soup.find_all('math'):
        new_tag = soup.new_tag('p')
        latex_string = convert_mathml2tex(str(item))
        new_tag.string = f"\\( {latex_string}\\)"
        item.replace_with(new_tag)
    # for x in soup.find_all():
    #     if len(x.get_text(strip=True)) == 0:
    #         x.extract()
    converted_equation = " ".join(str(soup).split())
    return sanitize(converted_equation).strip()
def main(args):
    """Clean up an HTML export: sanitize, drop class/style attributes,
    and turn <br/> runs into paragraph breaks."""
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    soup = bs(open(args.infn))
    beauty = soup.prettify(formatter='html')
    clean = sanitize(beauty)
    pure = re.sub('<br/>', '</p>\n<p>', clean)
    pure = re.sub(r'class="[^"]+"\s*', '', pure)
    pure = re.sub(r'style="[^"]+"\s*', '', pure)
    soup = bs(pure)
    with codecs.open(args.outfn, 'w', encoding='utf-8') as f:
        f.write(str(soup))   # Python 2 used unicode(soup)
def laundryHtml(html):
    """Clean using htmllaundry/lxml"""
    # docs: http://lxml.de/api/lxml.html.clean.Cleaner-class.html
    cleaner = cleaners.LaundryCleaner(
        allow_tags=getUserOption("keep_tags"),
        safe_attrs=getUserOption("keep_attrs"),
        processing_instructions=True,
        meta=True,
        scripts=True,
        comments=True,
        javascript=True,
        annoying_tags=True,
        page_structure=False,
        remove_unknown_tags=False,
        safe_attrs_only=False,
        add_nofollow=False,
        style=False,
        links=False,
        frames=False,
    )
    return sanitize(html, cleaner)
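# A minimal usage sketch; getUserOption is the add-on's config accessor,
# stubbed here with plausible values (assumptions, not from the source):
def getUserOption(key):
    return {"keep_tags": ['p', 'a', 'b', 'i', 'ul', 'ol', 'li'],
            "keep_attrs": ['href']}[key]

print(laundryHtml('<p onclick="x()">hi <script>bad()</script><a href="/">link</a></p>'))
# scripts and event handlers are removed; the allowed tags and attrs survive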
def create_project(self, pinfo, gpid):
    '''
    Create a new project out of the data harvested from gefonline.

    Unused attributes:
    ['PDF A Amount', 'GEF Project Grant (CEO Appr.)',
     'PDF-B (Supplemental-2) Approval Date', 'Project Cancellation Date',
     'PPG Amount', 'Cofinancing Total (CEO Endo.)',
     'GEF Agency Fees (CEO Endo.)', 'UNDP PMIS ID', 'Funding Source',
     'PDF-C Approval Date', 'GEF Project Grant', 'PIF Approval Date',
     'Cofinancing Total (CEO Appr.)', 'PDF-A Approval Date',
     'PPG Approval Date', 'PRIF Amount', 'PDF-B Approval Date',
     'GEF Project Grant (CEO Endo.)', 'GEF Agency Fees', 'PDF B Amount',
     'PDF C Amount', 'CEO Endorsement Date', 'GEF Agency',
     'Pipeline Entry Date', 'Cofinancing Total',
     'PDF-B (Supplemental) Approval Date', 'Strategic Program',
     'Project Cost (CEO Appr.)', 'GEF Agency Approval Date',
     'GEF Agency Fees (CEO Appr.)', 'Project Cost (CEO Endo.)']
    '''
    portal_transforms = getToolByName(self, 'portal_transforms')
    portal_types = getToolByName(self, 'portal_types')
    wftool = getToolByName(self, 'portal_workflow')
    if gpid != int(pinfo['GEF Project ID'].strip()):
        return {'name': 'Error in GEF Project ID', 'url': ''}
    name = pinfo['Project Name']
    url = self.context.absolute_url() + '/' + pinfo['GEF Project ID']
    project_id = pinfo.get('GEF Project ID').strip()
    global_project = (pinfo.get('Region', '').find('Global') > -1)
    countries = harvest.get_countries(pinfo.get('Country', ''))
    if 'Regional' in pinfo.get('Region', ''):
        project_scale = 'Regional'
    elif 'Global' in pinfo.get('Region', '') or 'Global' in pinfo.get('Country', ''):
        project_scale = 'Global'
    else:
        project_scale = 'National'
    project_status = pinfo.get('Project Status', None)
    try:
        start_date = DateTime(pinfo.get('Approval Date', None))
    except Exception:
        start_date = None
    if 'Project Completion Date' in pinfo:   # dict.has_key is Python 2 only
        end_date = DateTime(pinfo.get('Project Completion Date'))
    else:
        end_date = None
    focal_area = pinfo.get('Focal Area', None)
    operational_program = harvest.split_semicolon(
        pinfo.get('Operational Program', ''))
    strategic_program = harvest.split_semicolon(
        pinfo.get('Strategic Program', ''))
    project_allocation = harvest.convert_currency_to_millions(
        pinfo.get('GEF Grant', '0'))
    total_cost = harvest.convert_currency_to_millions(
        pinfo.get('Project Cost', '0'))
    wb_project_id = pinfo.get('IBRD PO ID', None)
    description = ""
    if 'GEF Agency' in pinfo:
        description += u"<h3>GEF Agency</h3> <p> %s </p>" % pinfo.get('GEF Agency')
    if 'Executing Agency' in pinfo:
        description += u"<h3>Executing Agency</h3> <p> %s </p>" % pinfo.get('Executing Agency')
    if 'Description' in pinfo:
        html = portal_transforms.convert(
            'web_intelligent_plain_text_to_html',
            pinfo.get('Description')).getData()
        description += u"<hr/><br/> %s" % html.decode('utf-8', 'ignore')
    if 'Implementation Status' in pinfo:
        description += u"<h3>Implementation Status</h3> <p> %s </p>" % pinfo.get('Implementation Status')
    portal_types.constructContent('Project', self.context, project_id)
    new_project = getattr(self.context, project_id)
    new_project.update(
        title=name,
        gef_project_id=project_id,
        wb_project_id=wb_project_id,
        # globalproject=global_project,
        country=countries,
        project_status=project_status,
        start_date=start_date,
        end_date=end_date,
        focal_area=focal_area,
        operational_programme=operational_program,
        strategic_priority=strategic_program,
        gef_project_allocation=str(project_allocation),
        total_cost=str(total_cost),
        project_summary=sanitize(description),
        project_scale=project_scale,
    )
    self._create_project_folders(new_project)
    wftool.doActionFor(new_project, 'submit')
    return {'name': name, 'url': url, 'description': description}
def sanitize_comment(s):
    return sanitize(s, cleaner=CommentCleaner)
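# A minimal usage sketch (the input string is illustrative): CommentCleaner
# keeps only a small comment-safe tag set and drops the rest.
dirty = '<p onclick="evil()">Hi <script>alert(1)</script>there</p>'
print(sanitize_comment(dirty))   # scripts and event handlers are stripped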
def scrape_data_site(url):
    # NOTE: `week` and `position_counter` are module-level globals in the
    # original script; they drive the comma/newline layout below.
    f = urllib.request.urlopen(url)   # Python 2 used urllib.urlopen
    s = f.read()
    f.close()
    soup = BeautifulSoup(s)
    table = soup.find('table', id="playertable_0")
    rows = table.findAll('tr')
    for j, tr in enumerate(rows):
        cols = tr.findAll('td')
        for i, td in enumerate(cols):
            if i == 0:
                temp = re.sub(r'\*', '', td.text)
                if re.search(' III', temp):
                    # normalize names with a 'III' suffix
                    n = re.sub(' III', ' ', temp)
                    n1 = StripMarkup(sanitize(n))
                    if re.search('\xa0', n1):
                        n3 = re.sub('\xa0', ' ', n1).split(' ')
                        print(n3[0], n3[1] + ',' + n3[3] + ',', end='')
                elif re.search('PLAYER', temp):
                    pass   # header row
                else:
                    li1 = StripMarkup(sanitize(temp))
                    if re.search('\xa0', li1):
                        li3 = re.sub('\xa0', ' ', li1).split(' ')
                        if position_counter == 16:
                            # row cutoffs differ per week: past the cutoff the
                            # line ends, otherwise it is comma-continued
                            if week < 4:
                                print(li3[0] + ',', end=' ')
                            elif week == 4 and j > 31:
                                print(li3[0])
                            elif week == 4:
                                print(li3[0] + ',', end=' ')
                            elif week in (5, 6) and j > 29:
                                print(li3[0])
                            elif week in (5, 6):
                                print(li3[0] + ',', end=' ')
                            elif week == 7 and j > 27:
                                print(li3[0])
                            elif week == 7:
                                print(li3[0] + ',', end=' ')
                            elif week in (8, 9) and j > 29:
                                print(li3[0])
                            elif week in (8, 9):
                                print(li3[0] + ',', end=' ')
                        elif len(tr) == 23:
                            print(li3[0], li3[1] + li3[2])
                        else:
                            print(li3[0], li3[1] + li3[2] + ',', end='')
            if i == 23:
                i23b = td.text.split(' ')[0]
                if j != 1:
                    print(StripMarkup(sanitize(i23b)))
def pre_validate(self, form):
    # DocumentCleaner is shared module state; guard the append so repeated
    # calls do not keep growing its allow_tags list
    c = htmllaundry.cleaners.DocumentCleaner
    if 'div' not in c.allow_tags:
        c.allow_tags.append('div')
    self.data = htmllaundry.sanitize(self.data, cleaner=c, wrap=None)
def sanitize_html_content(html_str: str) -> str:
    return sanitize(html_str)
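# A quick sketch of the defaults (expected output per htmllaundry's docs;
# verify against your installed version):
print(sanitize_html_content('Hello, <em>world</em>'))
# -> '<p>Hello, <em>world</em></p>'  (DocumentCleaner, loose text wrapped in <p>)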
def clean_html(content):
    return sanitize(content, CustomCleaner)
def sanitize_kml_description(description):
    if description:
        desc = description[0].text
        # sanitize the HTML snippet to avoid XSS
        return sanitize(desc)
def safe_body_html(text):
    """Take raw html and sanitize for safe use with tal:content="structure:x" """
    return sanitize(text, DocumentCleaner)
def htmlify(value):
    # Undo HTML escaping (entities back to characters, doubled quotes
    # collapsed), then sanitize the resulting markup.
    replacements = OrderedDict([('&amp;', '&'),
                                ('&lt;', '<'),
                                ('&gt;', '>'),
                                ('&quot;', '"'),
                                ('""', '"')])
    for k, v in replacements.items():
        value = value.replace(k, v)
    return sanitize(value, CommentCleaner)
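# A minimal usage sketch (the input is illustrative; the entity table above
# was reconstructed from context, so double-check it against the source):
print(htmlify('&lt;p onclick=&quot;x()&quot;&gt;bold&lt;/p&gt;'))
# the entities become real markup, which CommentCleaner then sanitizes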
# -*- coding: utf-8 -*-
# 2018-09-23
# Cleans HTML generated by Word so it can later be used to build an
# html + form-field template (Gravity Forms)
# Repo: wichert/htmllaundry
import io

from htmllaundry import sanitize
from htmllaundry.cleaners import CommentCleaner

nameFile = "Contrato de arrendamiento v.2.html"
fi = io.open(nameFile, mode="r", encoding="utf-8")
fo = io.open(nameFile + '-output.html', mode="w", encoding="utf-8")

# cleanedHtml = sanitize(fi.read())
cleanedHtml = sanitize(fi.read(), cleaner=CommentCleaner)
fo.write(cleanedHtml)
fo.close()

print(cleanedHtml)
# programPause = input("Press the <ENTER> key to continue...")