def parse(self, response):
    b = response.css('.templateContainer')
    page = response.url.split("/")[-1]
    filename = 'scraper/posto-%s.html' % page
    with open(filename, 'wb') as f:
        f.write(sanitize(b.get()).encode())
    self.log('Saved file %s' % filename)
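For context, a parse callback like this only runs inside a Scrapy spider. A minimal enclosing class, as a sketch; the spider name and start URL are hypothetical and do not appear in the snippet:

import scrapy
from htmllaundry import sanitize

class PostoSpider(scrapy.Spider):
    name = 'posto'                                # hypothetical
    start_urls = ['https://example.com/posto/1']  # hypothetical

    def parse(self, response):
        # body exactly as in the example above
        ...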
Example No. 2
    def convert(self, data, cache, **kwargs):
        bodydom = Element('div')
        kmldom = XML(data)
        # Derive the '{namespace}' prefix from the root tag, e.g.
        # '{http://www.opengis.net/kml/2.2}kml' -> '{http://www.opengis.net/kml/2.2}'
        ns = kmldom.tag.strip('kml')
        placemarks = kmldom.findall('.//%sPlacemark' % ns)
        for placemark in placemarks:
            titles = placemark.findall(ns + 'name')
            for title in titles:
                t = Element('h2')
                t.text = title.text
                bodydom.append(t)

            descriptions = placemark.findall(ns + 'description')
            for desc in descriptions:
                if desc.text:
                    try:
                        text = desc.text.encode('ascii',
                                                'xmlcharrefreplace').strip()
                    except Exception:
                        text = desc.text.strip()
                    text = sanitize(text)
                    d = XML('<div>' +
                            text.encode('ascii', 'xmlcharrefreplace') +
                            '</div>')
                    bodydom.append(d)

        body = tostring(bodydom)
        cache.setData(body)
        return cache
Example No. 4
def cleanup_html_preparer(value):
    if value is not colander.null:
        return htmllaundry.sanitize(value,
                                    cleaner=cleaners.DocumentCleaner,
                                    wrap=None)
    else:
        return colander.null
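A preparer like this is normally attached to a colander schema node, where it runs on submitted data before validation. A minimal sketch; the schema and field names are hypothetical:

import colander

class PageSchema(colander.MappingSchema):
    # The preparer launders submitted HTML before validators see it.
    body = colander.SchemaNode(colander.String(),
                               preparer=cleanup_html_preparer)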
Example No. 5
def crawl():
    session = Session()
    musicians = session.query(Musician).all()

    for musician in musicians:
        musician_id = musician.id
        musician_url = musician.live_info_url

        resp = requests.get(musician_url).text
        try:
            soup = sanitize(BeautifulSoup(resp, "lxml").get_text())
        except Exception:
            # Fall back to the more lenient html5lib parser.
            soup = sanitize(BeautifulSoup(resp, "html5lib").get_text())

        content = LiveInfo(musician_id=musician_id,
                           content=soup,
                           created_at=date.today())

        session.add(content)

    session.commit()
    arrange_updates()
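crawl() relies on Session, Musician, and LiveInfo being defined elsewhere. A minimal sketch of plausible SQLAlchemy definitions; the table names, columns, and connection URL are all assumptions:

from sqlalchemy import Column, Date, Integer, String, Text, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Musician(Base):
    __tablename__ = 'musicians'       # hypothetical
    id = Column(Integer, primary_key=True)
    live_info_url = Column(String)

class LiveInfo(Base):
    __tablename__ = 'live_info'       # hypothetical
    id = Column(Integer, primary_key=True)
    musician_id = Column(Integer)
    content = Column(Text)
    created_at = Column(Date)

engine = create_engine('sqlite:///musicians.db')  # hypothetical DSN
Session = sessionmaker(bind=engine)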
Example No. 6
def convert_kml_to_page(self):
    for brain in self.portal_catalog(portal_type='File', path='iwlearn/osm'):
        obj = brain.getObject()
        data = obj.get_data()
        parent = obj.getParentNode()
        if callable(obj.id):
            obj_id = obj.id()
        else:
            obj_id = obj.id
        # 'foo.kml'.strip('kml') leaves 'foo.', so this yields 'foo.htm'
        new_obj_id = obj_id.strip('kml') + 'htm'
        try:
            self.portal_types.constructContent('Document', parent, new_obj_id)
        except Exception:
            pass  # the document already exists
        new_obj = parent[new_obj_id]
        if parent.id == 'lmes':
            color = '0000bf'
        elif parent.id == 'rivers':
            color = '56ffff'
        elif parent.id == 'lakes':
            color = '2c80d3'
        elif parent.id == 'aquifers':
            color = 'c1742c'
        else:
            color = '00ff00'
        features = None
        title = ''
        text = ''
        try:
            features = extractfeatures_from_file(data)
            title = extract_title(data).strip('.kml')
            text = extract_description(data)
            print title
        except Exception:
            # title/text keep their defaults; the original referenced
            # them unbound when extraction failed.
            print 'exception in %s' % brain.getId
        if new_obj.getText():
            print 'skipping set text for %s' % brain.getId
        else:
            new_obj.setText(sanitize(text))
            new_obj.setTitle(title)
        if features:
            style = IGeoCustomFeatureStyle(new_obj)
            style.geostyles.data['use_custom_styles'] = True
            style.geostyles.data['polygoncolor'] = color
            style.geostyles.update(style.geostyles)
            geo = IGeoManager(new_obj)
            if features['MultiPolygon']:
                shp = features['MultiPolygon']
                q = shp.simplify(0.2).__geo_interface__
                geo.setCoordinates(q['type'], q['coordinates'])
Example No. 7
def sanitize_statement(statement):
    '''Sanitize a statement containing MathML
    into TeX plus minimal HTML.'''
    soup = BeautifulSoup(statement, features="lxml")
    for item in soup.find_all('math'):
        new_tag = soup.new_tag('p')
        latex_string = convert_mathml2tex(str(item))
        # Raw f-string keeps the \( ... \) MathJax delimiters literal.
        new_tag.string = rf"\( {latex_string}\)"
        item.replace_with(new_tag)
    # Optionally drop now-empty tags:
    #for x in soup.find_all():
    #    if len(x.get_text(strip=True)) == 0:
    #        x.extract()
    converted_equation = " ".join(str(soup).split())
    return sanitize(converted_equation).strip()
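A usage sketch; the input is hypothetical and the exact TeX depends on convert_mathml2tex, which is defined elsewhere:

# If convert_mathml2tex renders the fragment as x^{2}, the <math>
# element becomes <p>\( x^{2}\)</p> before sanitizing.
stmt = 'Solve <math><msup><mi>x</mi><mn>2</mn></msup></math> = 4'
print(sanitize_statement(stmt))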
Example No. 8
def main(args):
    """
    Main entry point.
    """
    logger = logging.getLogger(sys._getframe().f_code.co_name)

    soup = bs(open(args.infn))
    beauty = soup.prettify(formatter='html')
    clean = sanitize(beauty)
    pure = re.sub('<br/>', '</p>\n<p>', clean)
    pure = re.sub(r'class="[^"]+"\s*', '', pure)
    pure = re.sub(r'style="[^"]+"\s*', '', pure)
    soup = bs(pure)
    f = codecs.open(args.outfn, 'w', encoding='utf-8')
    f.write(unicode(soup))
    f.close()
Example No. 9
def laundryHtml(html):
    """Clean using htmllaundry/lxml"""
    # docs: http://lxml.de/api/lxml.html.clean.Cleaner-class.html

    cleaner = cleaners.LaundryCleaner(
        allow_tags=getUserOption("keep_tags"),
        safe_attrs=getUserOption("keep_attrs"),
        processing_instructions=True,
        meta=True,
        scripts=True,
        comments=True,
        javascript=True,
        annoying_tags=True,
        page_structure=False,
        remove_unknown_tags=False,
        safe_attrs_only=False,
        add_nofollow=False,
        style=False,
        links=False,
        frames=False,
    )

    return sanitize(html, cleaner)
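getUserOption comes from the surrounding add-on and is not shown here. A usage sketch with a hypothetical stand-in for it:

# Hypothetical stand-in for the add-on's config lookup.
def getUserOption(key):
    return {
        "keep_tags": ['p', 'a', 'b', 'i', 'ul', 'ol', 'li'],
        "keep_attrs": frozenset(['href']),
    }[key]

cleaned = laundryHtml('<p style="color:red" onclick="x()">hi</p>')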
Example No. 10
    def create_project(self, pinfo, gpid):
        '''
        create a new project out of the data harvested from gefonline.
        unused attributes:
    ['', 'PDF A Amount', 'GEF Project Grant (CEO Appr.)',
    'PDF-B (Supplemental-2) Approval Date', 'Project Cancellation Date',
    'PPG Amount', 'Cofinancing Total (CEO Endo.)',
    'GEF Agency Fees (CEO Endo.)', 'UNDP PMIS ID', 'Funding Source',
     'PDF-C Approval Date',
    'GEF Project Grant',
    'PIF Approval Date', 'Cofinancing Total (CEO Appr.)',
    'PDF-A Approval Date', 'PPG Approval Date', 'PRIF Amount',
    'PDF-B Approval Date',
    'GEF Project Grant (CEO Endo.)', 'GEF Agency Fees', 'PDF B Amount',
    'PDF C Amount', 'CEO Endorsement Date',
    'GEF Agency', 'Pipeline Entry Date', 'Cofinancing Total',
    'PDF-B (Supplemental) Approval Date',
    'Strategic Program', 'Project Cost (CEO Appr.)',
    'GEF Agency Approval Date', 'GEF Agency Fees (CEO Appr.)',
    'Project Cost (CEO Endo.)']
        '''
        portal_transforms = getToolByName(self, 'portal_transforms')
        portal_types = getToolByName(self, 'portal_types')
        wftool = getToolByName(self, 'portal_workflow')
        if gpid != int(pinfo['GEF Project ID'].strip()):
            return {'name': 'Error in GEF Project ID', 'url': ''}
        name = pinfo['Project Name']
        url = self.context.absolute_url() + '/' + pinfo['GEF Project ID']

        project_id = pinfo.get('GEF Project ID').strip()
        global_project = (pinfo.get('Region', '').find('Global') > -1)
        countries = harvest.get_countries(pinfo.get('Country', ''))
        if 'Regional' in pinfo.get('Region', ''):
            project_scale = 'Regional'
        elif 'Global' in pinfo.get('Region', '') or 'Global' in pinfo.get('Country', ''):
            project_scale = 'Global'
        else:
            project_scale = 'National'
        project_status = pinfo.get('Project Status', None)
        try:
            start_date = DateTime(pinfo.get('Approval Date', None))
        except Exception:
            start_date = None
        if pinfo.has_key('Project Completion Date'):
            end_date = DateTime(pinfo.get('Project Completion Date'))
        else:
            end_date = None
        focal_area = pinfo.get('Focal Area', None)
        operational_program = harvest.split_semicolon(
                    pinfo.get('Operational Program', ''))
        strategic_program = harvest.split_semicolon(
                    pinfo.get('Strategic Program', ''))
        project_allocation = harvest.convert_currency_to_millions(
                            pinfo.get('GEF Grant','0'))
        total_cost = harvest.convert_currency_to_millions(
                            pinfo.get('Project Cost', '0'))
        wb_project_id = pinfo.get('IBRD PO ID', None)
        description = ""
        if pinfo.has_key('GEF Agency'):
            description += u"<h3>GEF Agency</h3> <p> %s </p>" % pinfo.get('GEF Agency')
        if pinfo.has_key('Executing Agency'):
            description += u"<h3>Executing Agency</h3> <p> %s </p>" % pinfo.get('Executing Agency')
        if pinfo.has_key('Description'):
            html = portal_transforms.convert(
                'web_intelligent_plain_text_to_html',
                pinfo.get('Description')).getData()
            description += u"<hr/><br/> %s" % html.decode('utf-8', 'ignore')
        if pinfo.has_key('Implementation Status'):
            description += u"<h3>Implementation Status</h3> <p> %s </p>" % pinfo.get('Implementation Status')

        portal_types.constructContent('Project', self.context, project_id)

        new_project = getattr(self.context, project_id)

        new_project.update(
            title=name,
            gef_project_id=project_id,
            wb_project_id=wb_project_id,
            #globalproject=global_project,
            country=countries,
            project_status=project_status,
            start_date=start_date,
            end_date=end_date,
            focal_area=focal_area,
            operational_programme=operational_program,
            strategic_priority=strategic_program,
            gef_project_allocation=str(project_allocation),
            total_cost=str(total_cost),
            project_summary=sanitize(description),
            project_scale=project_scale,
        )

        self._create_project_folders(new_project)
        wftool.doActionFor(new_project, 'submit')
        return {'name': name, 'url': url, 'description': description}
Example No. 11
def sanitize_comment(s):
    return sanitize(s, cleaner=CommentCleaner)
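CommentCleaner is one of the stock cleaner configurations shipped in htmllaundry.cleaners, so the wrapper needs only these imports; a minimal usage sketch:

from htmllaundry import sanitize
from htmllaundry.cleaners import CommentCleaner

# Scripts and event-handler attributes are laundered out; the exact
# output markup depends on the htmllaundry version.
print(sanitize_comment('<p onclick="steal()">hello</p>'))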
Example No. 12
def scrape_data_site(url):
    # NOTE: relies on module-level globals `week` and `position_counter`.
    f = urllib.urlopen(url)
    s = f.read()
    f.close()
    soup = BeautifulSoup(s)
    table = soup.find(lambda tag: tag.name == 'table' and
                      tag.has_key('id') and tag['id'] == "playertable_0")
    rows = table.findAll('tr')

    for j, tr in enumerate(rows):
        cols = tr.findAll('td')
        for i, td in enumerate(cols):
            if i == 0:
                temp = re.sub(r'\*', '', td.text)
                if re.search(' III', temp):
                    n = re.sub(' III', ' ', temp)
                    n1 = StripMarkup(sanitize(n))
                    # Always normalize; the original left n2 unbound
                    # when no non-breaking space was present.
                    n2 = re.sub('\xa0', ' ', n1)
                    n3 = n2.split(' ')
                    print n3[0], n3[1] + ',' + n3[3] + ",",
                    sys.stdout.softspace = 0
                elif re.search('PLAYER', temp):
                    pass
                else:
                    li1 = StripMarkup(sanitize(temp))
                    if re.search('\xa0', li1):
                        li2 = re.sub('\xa0', ' ', li1)
                        li3 = li2.split(' ')
                        if position_counter == 16:
                            # The row index that ends a line (no trailing
                            # comma) depends on the week.
                            if week < 4:
                                print li3[0] + ",",
                            elif week == 4 and j > 31:
                                print li3[0]
                            elif week == 4 and j <= 31:
                                print li3[0] + ",",
                            elif week in (5, 6) and j > 29:
                                print li3[0]
                            elif week in (5, 6) and j <= 29:
                                print li3[0] + ",",
                            elif week == 7 and j > 27:
                                print li3[0]
                            elif week == 7 and j <= 27:
                                print li3[0] + ",",
                            elif week in (8, 9) and j > 29:
                                print li3[0]
                            elif week in (8, 9) and j <= 29:
                                print li3[0] + ",",
                        else:
                            if len(tr) == 23:
                                print li3[0], li3[1] + li3[2]
                            else:
                                print li3[0], li3[1] + li3[2] + ',',
                        sys.stdout.softspace = 0
            if i == 23:
                i23b = td.text.split(' ')[0]
                if j != 1:  # row 1 is skipped in the original logic
                    print StripMarkup(sanitize(i23b))
Example No. 13
def pre_validate(self, form):
    c = htmllaundry.cleaners.DocumentCleaner
    # DocumentCleaner is shared module state; guard so allow_tags
    # does not grow by one 'div' on every call.
    if 'div' not in c.allow_tags:
        c.allow_tags.append('div')
    self.data = htmllaundry.sanitize(self.data, cleaner=c, wrap=None)
Example No. 14
def sanitize_html_content(html_str: str) -> str:
    return sanitize(html_str)
Example No. 15
def clean_html(content):
    return sanitize(content, CustomCleaner)
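CustomCleaner is defined outside this snippet. A plausible sketch, assuming it follows the LaundryCleaner pattern from Example No. 9; every tag and attribute choice here is hypothetical:

from htmllaundry.cleaners import LaundryCleaner

CustomCleaner = LaundryCleaner(
    allow_tags=['p', 'a', 'em', 'strong', 'ul', 'ol', 'li', 'br'],
    safe_attrs=frozenset(['href', 'title']),
    safe_attrs_only=True,
    page_structure=False,
    remove_unknown_tags=False,   # required when allow_tags is used
    scripts=True,                # strip <script> elements
    javascript=True,             # strip javascript: URLs and CSS scripting
    comments=True,               # strip HTML comments
)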
Example No. 16
def sanitize_kml_description(description):
    if description:
        desc = description[0].text
        #sanitize html snippet to avoid XSS
        return sanitize(desc)
Example No. 18
def safe_body_html(text):
    """
    Take raw html and sanitize for safe use with tal:content="structure:x"
    """
    return sanitize(text, DocumentCleaner)
Example No. 19
def htmlify(value):
    replacements = OrderedDict([('&amp;', '&'), ('&lt;', '<'), ('&gt;', '>'),
                                ('&quot;', '"'), ('""', '"')])
    for k, v in replacements.items():
        value = value.replace(k, v)
    return sanitize(value, CommentCleaner)
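Note that the OrderedDict ordering matters: '&amp;' is unescaped first, so doubly-escaped markup is fully unescaped before laundering. A quick illustration:

# '&amp;lt;b&amp;gt;' -> '&lt;b&gt;' -> '<b>' by successive replaces;
# what survives of the <b> tag afterwards is up to CommentCleaner.
print(htmlify('&amp;lt;b&amp;gt;bold&amp;lt;/b&amp;gt;'))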
Example No. 20
# -*- coding: utf-8 -*-
# 2018-09-23
# Cleans up HTML generated by Word so it can later be used to build an
# HTML template plus form fields (Gravity Forms).
# Repo: wichert / htmllaundry
import io
from htmllaundry import sanitize
from htmllaundry.cleaners import CommentCleaner

nameFile = "Contrato de arrendamiento v.2.html"
fi = io.open(nameFile, mode="r", encoding="utf-8")
fo = io.open(nameFile + '-output.html', mode="w", encoding="utf-8")

#cleanedHtml = sanitize(fi.read())
cleanedHtml = sanitize(fi.read(), cleaner=CommentCleaner)
fo.write(cleanedHtml)

fi.close()
fo.close()

print cleanedHtml
#programPause = raw_input("Press the <ENTER> key to continue...")