Code Example #1
File: DocsTest.py Project: 18600597055/hue
 def test_doc_with_entity(self):
     h = "é"
     expected = DOC % "é"
     doc, err = tidy_document(h)
     self.assertEqual(doc, expected)
     
     expected = DOC % "é"
     doc, err = tidy_document(h, {'numeric-entities':1})
     self.assertEqual(doc, expected)
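
A note on the API this test exercises: tidy_document() always returns a (document, errors) pair, and the DOC template is a fixture defined elsewhere in DocsTest.py. A minimal self-contained sketch of the same round-trip, with a hypothetical stand-in for DOC:

from tidylib import tidy_document

# Hypothetical stand-in for the DOC fixture used by the test above.
DOC = "<html><head><title></title></head><body>%s</body></html>"

# With numeric-entities enabled, named entities such as &eacute;
# are rewritten to their numeric form (&#233;).
document, errors = tidy_document("&eacute;", {"numeric-entities": 1})
print(document)
print(errors or "(no tidy warnings)")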
Code Example #2
File: helpers.py Project: VitorVRS/kuma
def _massage_diff_content(content):
    tidy_options = {
        'output-xhtml': 0,
        'force-output': 1,
    }
    try:
        content = tidy_document(content, options=tidy_options)
    except UnicodeDecodeError:
        # In case something happens in pytidylib we'll try again with
        # a proper encoding
        content = tidy_document(content.encode('utf-8'), options=tidy_options)
        tidied, errors = content
        content = tidied.decode('utf-8'), errors
    return content
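
The fallback above relies on pytidylib returning the same string type it was given: a bytes input yields a bytes document that has to be decoded again. A minimal sketch of the two call shapes (the input markup is illustrative):

from tidylib import tidy_document

options = {'output-xhtml': 0, 'force-output': 1}

# Text in, text out.
doc, errors = tidy_document(u'<p>caf\xe9', options=options)

# Bytes in, bytes out -- the caller decodes the tidied result itself.
raw_doc, raw_errors = tidy_document(u'<p>caf\xe9'.encode('utf-8'), options=options)
doc = raw_doc.decode('utf-8')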
Code Example #3
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()

    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery("td#content")
    assert len(td) == 1

    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')

        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td =
    # no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html()  # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )
Code Example #4
File: parser.py Project: mhagander/pgarchivesweb
    def html_clean(self, html):
        # First we pass it through tidy
        (html, errors) = tidylib.tidy_document(html,
                                               options={
                                                   'drop-proprietary-attributes': 1,
                                                   'alt-text': '',
                                                   'hide-comments': 1,
                                                   'output-xhtml': 1,
                                                   'show-body-only': 1,
                                                   'clean': 1,
                                                   'char-encoding': 'utf8',
                                                   'show-warnings': 0,
                                                   'show-info': 0,
                                               })
        if errors:
            print(("HTML tidy failed for %s!" % self.msgid))
            print(errors)
            return None

        try:
            cleaner = HTMLCleaner()
            cleaner.feed(html)
            return cleaner.get_text()
        except Exception as e:
            # Failed to parse the html, thus failed to clean it. so we must
            # give up...
            return None
Code Example #5
File: sinads.py Project: B-Rich/dealer
 def __trading_years(self, instrument):
     re = urllib2.urlopen('http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_MarketHistory/stockid/%s.phtml' % (instrument))
     document, errors = tidy_document(re.read())
     soup = BeautifulSoup(document)
     node = soup.find('select', attrs={'name':'year'})
     for option in node.findAll('option'):
         yield option.getText()
Code Example #6
    def process_response(self, request, response):
        if 'text/html' in response['Content-Type'] and response.content:
            document, errors = tidy_document(response.content)
            if errors:
                raise HTMLValidationError(errors)

        return response
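
HTMLValidationError is not shown in the excerpt; a minimal hypothetical stand-in for this Django-style middleware would be:

class HTMLValidationError(Exception):
    """Raised when tidy reports problems in an HTML response.

    The tidy error string becomes the exception message, so the
    offending line/column references appear in the traceback.
    """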
Code Example #7
File: getMenu.py Project: blueskywalker/menuReview
def getMenu():
    storeFile = open("list.txt","r")
    txt = storeFile.read()
    storeFile.close()
    
    list=txt.split('\n\n\n')
    

 #   print list
    
    for store in list:    
#        print store
        rest = store.split('\n')
        if len(rest)!=3:
            break
        try:
            url=baseUrl+rest[2] +'menu'
            print url
            res=urlopen(url)
            html=res.read()    
         
            options = {'output-encoding':'utf8', 'output-xhtml':1 }
            document,errors = tidy_document(html,options)   
            
            filepath = dataDir+ (rest[2].split('/'))[2] + ".html"
            saveFile = open(filepath,"w")
            saveFile.write(document)
            saveFile.close()
            print filepath
        except :
            print "skip:"+url
Code Example #8
File: htmlutils.py Project: kollivier/brightwriter
def cleanUpHTML(html, options=None):
    import tidylib
    tidylib.BASE_OPTIONS = {}

    default_options = { 
                        "force-output" : 1,
                        "output-xhtml" : 1,
                        "doctype" : "strict",
                        "drop-empty-paras": 1,
                        "output-encoding" : "utf8",
                        "clean": 1,
                        "bare": 1
                       }
    if options:
        default_options.update(options)  # dicts have update(), not extend()

    # first fix up footnotes so that HTMLTidy won't ditch them
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    footnoteFixer(soup) #html)
    stripEmptyParagraphs(soup)
    
    html, errors = tidylib.tidy_document(soup.prettify(encoding=None), options=default_options)
    
    soup = BeautifulSoup.BeautifulSoup(html, smartQuotesTo="html")
    addMetaTag(soup, [('http-equiv', 'Content-type'), ('content', 'text/html; charset=utf-8')])
    
    return soup.prettify(encoding=None), errors
Code Example #9
def _tidy2xhtml5(html):
    """Tidy up a html4/5 soup to a parsable valid XHTML5.
    Requires tidy-html5 from https://github.com/w3c/tidy-html5 Installation: http://goo.gl/FG27n
    """
    html = _io2string(html)
    html = _pre_tidy(html) # Pre-process
    xhtml5, errors = tidy_document(html,
        options={
            'merge-divs': 0,       # do not merge nested div elements - preserve semantic block structures
            'output-xml': 1,       # create xml output
            'indent': 0,           # don't indent; it adds extra whitespace/linefeeds, which causes problems
            'tidy-mark': 0,        # No tidy meta tag in output
            'wrap': 0,             # No wrapping
            'alt-text': '',        # Help ensure validation
            'force-output': 1,     # May not get what you expect but you will get something
            'numeric-entities': 1, # replace named HTML entities such as &nbsp; with numeric ones
            'clean': 1,            # strip surplus presentational markup
            'bare': 1,
            'word-2000': 1,
            'drop-proprietary-attributes': 1,
            'enclose-text': 1,     # enclose text in body always with <p>...</p>
            'logical-emphasis': 1, # transforms <i> and <b> text to <em> and <strong> text
            # do not tidy all MathML elements! List of MathML 3.0 elements from http://www.w3.org/TR/MathML3/appendixi.html#index.elem
            'new-inline-tags': 'abs, and, annotation, annotation-xml, apply, approx, arccos, arccosh, arccot, arccoth, arccsc, arccsch, arcsec, arcsech, arcsin, arcsinh, arctan, arctanh, arg, bind, bvar, card, cartesianproduct, cbytes, ceiling, cerror, ci, cn, codomain, complexes, compose, condition, conjugate, cos, cosh, cot, coth, cs, csc, csch, csymbol, curl, declare, degree, determinant, diff, divergence, divide, domain, domainofapplication, el, emptyset, eq, equivalent, eulergamma, exists, exp, exponentiale, factorial, factorof, false, floor, fn, forall, gcd, geq, grad, gt, ident, image, imaginary, imaginaryi, implies, in, infinity, int, integers, intersect, interval, inverse, lambda, laplacian, lcm, leq, limit, list, ln, log, logbase, lowlimit, lt, maction, malign, maligngroup, malignmark, malignscope, math, matrix, matrixrow, max, mean, median, menclose, merror, mfenced, mfrac, mfraction, mglyph, mi, min, minus, mlabeledtr, mlongdiv, mmultiscripts, mn, mo, mode, moment, momentabout, mover, mpadded, mphantom, mprescripts, mroot, mrow, ms, mscarries, mscarry, msgroup, msline, mspace, msqrt, msrow, mstack, mstyle, msub, msubsup, msup, mtable, mtd, mtext, mtr, munder, munderover, naturalnumbers, neq, none, not, notanumber, note, notin, notprsubset, notsubset, or, otherwise, outerproduct, partialdiff, pi, piece, piecewise, plus, power, primes, product, prsubset, quotient, rationals, real, reals, reln, rem, root, scalarproduct, sdev, sec, sech, selector, semantics, sep, set, setdiff, share, sin, sinh, subset, sum, tan, tanh, tendsto, times, transpose, true, union, uplimit, variance, vector, vectorproduct, xor',
            'doctype': 'html5',
            })

    #return xhtml5
    # return the tree itself, there is another modification below to avoid
    # another parse
    return _post_tidy(xhtml5)
Code Example #10
    def process_response(self, request, response, spider):
        if 'index-mo' in response.url:
            return response
        if 'index-do' in response.url:
            return response
        if 'index-e_types' in response.url:
            return response

        body = response.body
        
        # VERY UGLY...  need to get this done, so it's ugly for now...
        index = body.find("<table")
        while index != -1:
            inner = body.find("<table", index+6)
            endtable = body.find("</table", index+6)
            if inner != -1 and inner < endtable:
                # we have an inner table...
                if body.find("<tr", inner, endtable) != -1:
                    break  # if it's truly a table
                    
                else:
                    start = inner-1
                    end = body.find(">", inner)+1
                    body = body[:start] + body[end:]
                    
                    endtable = body.find("</table", index+6)
                    
                    start = endtable-1
                    end = body.find(">", endtable)+1
                    body = body[:start] + body[end:]
                    
            index = body.find("<table", index+6)
            
        # remove any <br> before we tidy it up
        body = self.br_re.sub('', body)
        body = self.empty_anchor_re.sub('', body)
        
        tidylib.BASE_OPTIONS = {
            "output-xhtml": 0,     # XHTML instead of HTML4
            "indent": 1,           # Pretty; not too much of a performance hit
            "tidy-mark": 0,        # No tidy meta tag in output
            "wrap": 0,             # No wrapping
            "alt-text": "",        # Help ensure validation
            "doctype": 'omit',     # Little sense in transitional for tool-generated markup...
            "force-output": 1,     # May not get what you expect but you will get something
        }
        body, _ = tidy_document(body, options={'drop-empty-paras':1,
                    'drop-font-tags':1,'enclose-text':1,'merge-divs':1,'fix-bad-comments':1})
            
        body = self.link_re.sub('\g<1>', body)
        body = self.vmodl_re.sub('\g<1>', body)
        body = self.strong_re.sub('\g<1>', body)
        body = self.script_re.sub('', body)
        body = self.ul_re.sub('', body)
        body = self.li_end_re.sub('', body)
        body = self.li_re.sub('* ', body)
        body = self.ul_end_re.sub('', body)
        
        response = response.replace(body=body)
        return response
Code Example #11
File: test_docs.py Project: GertBurger/pytidylib
 def test_xmlns_large_document_xml_corner_case(self):
     # Test for a super weird edge case in Tidy that can cause it to return
     # the wrong required buffer size.
     body = '<span><span>A</span></span>' + 'A' * 7937
     html = '<html xmlns="http://www.w3.org/1999/xhtml">' + body
     doc, err = tidy_document(html, {'output-xml': 1})
     self.assertEqual(doc.strip()[-7:], "</html>")
Code Example #12
File: DocsTest.py Project: 18600597055/hue
 def test_doc_with_unclosed_tag(self):
     h = "<p>hello"
     expected = DOC % '''<p>
   hello
 </p>'''
     doc, err = tidy_document(h)
     self.assertEqual(doc, expected)
Code Example #13
def call():
    if world.results:
        return

    data = urllib.urlencode(world.params)
    req = urllib2.Request(url="%s/%s?%s" % (world.base_url, world.requesttype, data),
                          headers=world.header)
    fd = urllib2.urlopen(req)
    page = fd.read()

    fmt = world.params.get('format')
    if fmt not in ('html', 'xml', 'json', 'jsonv2'):
        fmt = 'xml' if world.requesttype == 'reverse' else 'html'
    pageinfo = fd.info()
    assert_equal('utf-8', pageinfo.getparam('charset').lower())
    pagetype = pageinfo.gettype()
    if fmt == 'html':
        assert_equals('text/html', pagetype)
        document, errors = tidy_document(page, 
                             options={'char-encoding' : 'utf8'})
        assert(len(errors) == 0), "Errors found in HTML document:\n%s" % errors
        world.results = document
    elif fmt == 'xml':
        assert_equals('text/xml', pagetype)
        world.results = parseString(page).documentElement
    else:
        if 'json_callback' in world.params:
            func = world.params['json_callback']
            assert page.startswith(func + '(')
            assert page.endswith(')')
            page = page[(len(func)+1):-1]
            assert_equals('application/javascript', pagetype)
        else:
            assert_equals('application/json', pagetype)
        world.results = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(page)
Code Example #14
    def run(self):
        """ Main entry point
        
        Run FuncInventory and if there is a change, prepare and send an email.
        """
        # Run the inventory
        inventory = func_inventory.FuncInventory()
        inventory.run(
            [
                'func-inventory',
                '--tree=%s' % self.config['git_repo'],
                '--modules=%s' % ','.join(self.config['modules'])
            ])

        diff = self.git_diff()
        if not diff:
            self.log('No changes detected.  Sleeping.')
        else:
            self.log('CHANGE DETECTED in func-inventory.')

            kw = dict(dark_bg=self.config['dark_bg'],
                      font_size=self.config['font_size'])
            html = ansi2html.Ansi2HTMLConverter(**kw).convert(diff)

            html, errors = tidylib.tidy_document(html)

            html = pypremailer.Premailer(html).premail()

            self.mail(html)

            self.log('Done mailing changes.')
Code Example #15
   def marklogic_put_xml(self, item, spider_name):
       # Set the uri and collection
       if (self.ml_transform == ''):
           params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name}
       else:
           params = {'uri': item['uri'], 'collection': self.ml_collections or spider_name, 'transform': self.ml_transform}
       # Set up the XML payload
       payload = dicttoxml(dict(item), attr_type=False, custom_root='webcontent')
       # Decode the <> characters back again
       payload = payload.replace('&lt;', '<').replace('&gt;', '>').replace('&apos;', "'").replace('&quot;', '"')
       # Run tidy in order to get well-formed XML
       payload, errors = tidy_document(payload, options={'input-xml': 1})
 
       # Set up the header
       headers = {'Content-Type': 'application/xml'}
 
       ml_uri = ('ml_uri' in item and item['ml_uri']) or self.ml_uri
       logging.info("PUTting XML in " + ml_uri + " as " + item['uri'])
 
       # Call the MarkLogic REST endpoint
       ml_user = ('ml_user' in item and item['ml_user']) or self.ml_user
       ml_pwd = ('ml_pwd' in item and item['ml_pwd']) or self.ml_pwd
       r = requests.put(ml_uri,
           params = params,
           auth = HTTPDigestAuth(ml_user, ml_pwd),
           data = payload,
           headers = headers)
 
       logging.info("PUT response: " + str(r.status_code) + ", " + r.text)
Code Example #16
 def dynamic_test_method(self):
     """this function name doesn't matter much, it can start with `test`,
     but we're going to rename it dynamically below"""
     reportURLstring = '/report?reportname=' + reportItem.metadata['action']
     response=self._my_app.get(reportURLstring)
     code, error=tidylib.tidy_document(response.body, options={'show-errors':1, 'show-warnings':0})
     self.assertFalse(error, '%s did not return valid html page' % reportURLstring)
Code Example #17
File: enml.py Project: shurain/archiver
def html2enml(html):
    # doc, err = tidy_fragment(

    doc, err = tidy_document(
        html,
        options={
            "output-xhtml": 1,
            "drop-proprietary-attributes": 1,
            "merge-divs": 1,
            "clean": 1
        }
    )

    root = fromstring(doc)

    # XXX dirty hack to circumvent a bug in lxml parser
    root = fromstring(etree.tostring(root))

    logging.debug(etree.tostring(root))

    # tidy_document returns a valid html document which means it usually contains html tag and proper body element
    root = root.find('body')
    if root is None:
        logging.warn("No body on this document")
        logging.warn(html)
        return "<div></div>"
    root.tag = 'div'

    root = remove_prohibited_elements(root)
    root = remove_prohibited_attributes(root)
    #FIXME Skipping dtd validation because of slow DTD creation speed
    # validate_dtd(html, f):

    return etree.tostring(root)
Code Example #18
    def process_response(self, request, response):
        if not _has_tidylib or not self._is_html(request, response):
            return response

        html, errors = tidylib.tidy_document(response.content, self._options, keep_doc=True)
        if not errors:
            return response

        # Filter out what we care about
        err_list = errors.rstrip().split("\n")
        err_list = self._filter_warnings(err_list)
        if not err_list:
            return response

        try:
            fn = urlresolvers.resolve(request.path)[0]
            fn_name = "%s.%s" % (fn.__module__, fn.__name__)
        except:
            fn_name = "<unresolved_url>"

        # Write the two versions of html out for offline debugging
        filename = os.path.join(self._outdir, fn_name)

        result = (
            "HTML tidy result: %s [%s]:"
            "\n\t%s"
            "\nPlease see %s.orig %s.tidy\n-------" % (request.path, fn_name, "\n\t".join(err_list), filename, filename)
        )

        file(filename + ".orig", "w").write(i18n.smart_str(response.content))
        file(filename + ".tidy", "w").write(i18n.smart_str(html))
        file(filename + ".info", "w").write(i18n.smart_str(result))

        self._logger.error(result)
        return response
Code Example #19
File: bvb.py Project: orithena/sportswarnbot
def fetch_data():
    def bvbreplace(s):
        return "BVB" if "Dortmund" in s else s

    doc = None
    try:
        doc, errs = tidy_document(urllib2.urlopen('http://www.bvb.de/').read(), tidyoptions)
        soup = Soup(doc)
    except Exception as e:
        raise Exception(u"Error fetching/parsing website: %s" % e)

    out = ''
    matchtime = datetime.datetime.now() + datetime.timedelta(hours=25)
    timestr = ''
    try:
        home = bvbreplace(select(soup, "div.next-match p span")[0].contents[0].strip())
        guest = bvbreplace(select(soup, "div.next-match p span")[1].contents[0].strip())
        league = ''
        try:
            league = select(soup, "div.next-match p span.tournament")[0].contents[0].strip()
        except:
            league = select(soup, "div.next-match p span")[2].contents[0].strip()            
        matchtime = datetime.datetime.strptime(select(soup, "div.next-match p")[1].contents[-1].strip(), u"%d.%m.%Y %H:%M")
        timestr = matchtime.strftime(u"%a, %d.%m.%Y %H:%M")
        dontgo = u"U42/U46/Kreuzviertel/Borsigplatz/Uni-Parkplatz" if u"BVB" == home else u"Kneipen mit TV in Dortmund"
        location = u"Heim" if u"BVB" == home else u"Auswaerts"
        out = u"WARNUNG! %s: %s vs %s (%s/%s). Meide %s." % (timestr, home, guest, location, league, dontgo)
    except IndexError:
        # This means: No next game on the webpage.
        sys.exit(1)
    except Exception as e:
        #print(traceback.format_exc())
        raise Exception(u"ERRBVB while parsing bvb.de: %s" % e)
    return out, matchtime
Code Example #20
def nofoutofplacefeatures(url):
	try:
	

	#	pdb.set_trace()

		if url[:4]=="http":
			r = requests.get(url)
		else:
			url="http://"+url
			r  = requests.get(url)

		#r = requests.get(url)
		data = r.text
		data2=r.content

		document, errors = tidy_document(data,
		  options={'numeric-entities':1})

		#print document
		#print errors
		#print "Number of Elements Out of Place : " + str(len(errors))
		return len(errors)
	except:
		pass
Code Example #21
def get_employees(lastname, firstname):
    payload = { 'find' : lastname }
    res = requests.get('https://www.campus.rwth-aachen.de/rwth/all/lecturerlist.asp', params=payload)
    if res.status_code == 200:
        persons = [ ]
        
        document, errors = tidy_document(res.content, options={'numeric-entities': 1, 'output_xhtml': 1})
        tree = ET.fromstring(strip_ns(document))
        
        try:
            filename = posixpath.basename(urlparse.urlsplit(res.url).path)
            if filename == 'lecturer.asp':
                fullname = tree.find('body/table[1]/tr[3]//tr[2]/td[2]').text.strip()
                unit = tree.find("body/table[2]//td[@class='h3']/a").text.strip()
            
                persons.append(fullname)

            elif filename == 'lecturerlist.asp':
                links = [ ]
                for cell in tree.findall('body/table[2]//td[3]/table[2]//td[1]/a'):
                    if cell is not None:
                        fullname = cell.text.strip()
                        persons.append(fullname)
            else:
                raise Exception
        except:
            print "===> WARNING: failed to get employee list for: %s, %s" % (firstname, lastname)
        
        return persons
Code Example #22
File: notilitus.py Project: arychj/Notilitus
def sanitize(note):
	debug('Sanitizing note content...', 2)

	if get_setting('evernote/sanitize/@applytemplate') == 'True':
		with open(get_setting('evernote/sanitize/template/text()'), 'r') as file:
			template = file.read()
			template = template.replace('{content}', note['content'])
			
		note['content'] = transform(template)
		
		preservedElements = []
		preservePattern = get_setting('evernote/sanitize/preserve/pattern/text()')
		preserves = get_setting('evernote/sanitize/preserve/elements/text()').split(',')
		for preserve in preserves:
			matches = re.findall(preservePattern.format(preserve), note['content'])
			for match in matches:
				placeholder = '{%s}' % uuid.uuid4().hex
				preservedElements.append({'placeholder': placeholder, 'element': match})
				note['content'] = note['content'].replace(match, placeholder, 1)
	
		note['content'] = re.sub(get_setting('evernote/sanitize/attributes/empty/text()'), '', note['content'])
		note['content'] = re.sub(get_setting('evernote/sanitize/attributes/prohibited/text()'), '', note['content'])
		note['content'] = re.sub(get_setting('evernote/sanitize/elements/text()'), '', note['content'])
		note['content'] = note['content'].encode('utf-8', errors='ignore')
		(note['content'], errors) = tidy_document(note['content'])

		for element in preservedElements:
			note['content'] = note['content'].replace(element['placeholder'], element['element'])
	
	if note['title'] != None:
		note['title'] = note['title'].replace('\n', ' ').replace('\r', '').replace('  ', ' ')
	else:
		note['title'] = get_setting('evernote/sanitize/defaulttitle/text()')
Code Example #23
File: main.py Project: bradmontgomery/word2html
def convert_to_html(filename):
    # Do the conversion with pandoc
    output = pypandoc.convert(filename, 'html')

    # Clean up with tidy...
    output, errors = tidy_document(output,  options={
        'numeric-entities': 1,
        'wrap': 80,
    })
    print(errors)

    # replace smart quotes.
    output = output.replace(u"\u2018", '&lsquo;').replace(u"\u2019", '&rsquo;')
    output = output.replace(u"\u201c", "&ldquo;").replace(u"\u201d", "&rdquo;")

    # write the output
    filename, ext = os.path.splitext(filename)
    filename = "{0}.html".format(filename)
    with open(filename, 'w') as f:
        # Python 2 "fix". If this isn't a string, encode it.
        if type(output) is not str:
            output = output.encode('utf-8')
        f.write(output)

    print("Done! Output written to: {}\n".format(filename))
Code Example #24
File: homepage.py Project: adam139/xsgs.theme
    def _tidysrc(self,data,srccode):
        """tidy scribe the html src"""

        try:
            from tidylib import tidy_document
            BASE_OPTIONS = {
    "output-xhtml": 1,     # XHTML instead of HTML4
    "indent": 1,           # Pretty; not too much of a performance hit
    "tidy-mark": 0,        # No tidy meta tag in output
    "wrap": 0,             # No wrapping
    "alt-text": "",        # Help ensure validation
    "doctype": 'strict',   # Little sense in transitional for tool-generated markup...
    "force-output": 1,     # May not get what you expect but you will get something
    "char-encoding":'utf-8',
    "input-encoding":srccode,
    "output-encoding":'utf-8',
    }
            if not isinstance(data, unicode):                
                try:
                    data = data.decode(srccode)
                except:
                    pass
            doc, errors = tidy_document(data,options={'numeric-entities':1})
            return doc
        except:
            return data
Code Example #25
File: DocsTest.py Project: 18600597055/hue
 def test_doc_with_unicode_subclass(self):
     class MyUnicode(unicode):
         pass
     
     h = MyUnicode(u"unicode string ß")
     expected = unicode(DOC, 'utf-8') % h
     doc, err = tidy_document(h)
     self.assertEqual(doc, expected)
Code Example #26
File: templates.py Project: edmw/scripts
 def pretty(self, html):
     soup = BeautifulSoup(html, "html5lib")
     document, errors = tidy_document(soup.encode(formatter="html"), options={
         'char-encoding': 'utf8',
         'output-encoding': 'utf8',
         'doctype': 'html5'
     })
     return document
Code Example #27
 def init_stats(self):
     self.source = self.response.content.decode('utf-8')
     if tidylib:
         document, errors = tidylib.tidy_document(self.source, options={'numeric-entities':1})
     else:
         errors = ""
     if errors:
         self.errors = [e.groupdict() for e in error_re.finditer(errors)]
Code Example #28
File: DocsTest.py Project: waylan/pytidylib
 def test_doc_with_unicode_subclass(self):
     class MyUnicode(utype):
         pass
     
     h = MyUnicode("unicode string ß")
     expected = DOC % h
     doc, err = tidy_document(h, {'output_xhtml':1})
     self.assertEqual(doc, expected)
Code Example #29
File: transform.py Project: koslab/platocdp.devoops
 def tidy(self, data):
     document, errors = tidy_document(data, {
         'input-xml': True, 'output-xml': True,
         'preserve-entities': True, 'numeric-entities': True
     })
     if errors:
         print errors
     return document
Code Example #30
    def process_response(self, request, response):
        document, errors = tidy_document(response.content, options={"numeric-entities": 1})
        self.log_data = (document, errors)
        self.src_content = response.content
        errors_list = errors.split("\n")
        self.errors_count = len([err for err in errors_list if "error:" in err.lower()])
        self.warns_count = len([err for err in errors_list if "warning:" in err.lower()])

        return response
Code Example #31
def validate_html(endpoint, document):
    """
    This function can be used to make sure HTML returned is valid
    It raises an exception describing what's wrong when invalid HTML was entered
    :param endpoint: name of the function which returned the html content
    :param document: the html content
    :return: None
    """
    tidied, errors = tidy_document(document)
    if errors:
        raise SystemError(
            "Errors were found in the following HTML returned by function {}:\n{}\n\nErrors:\n{}"
            .format(endpoint, document, errors))
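
A hedged usage sketch (the endpoint and markup are made up for illustration):

def index():  # hypothetical view function
    return "<p>unclosed paragraph"

# tidy reports the missing doctype and title, so validate_html raises
# SystemError carrying both the document and tidy's error listing.
validate_html('index', index())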
Code Example #32
 def pretty(self, tidy_warnings=False) -> str:
     '''Like render() but format through tidylib'''
     txt, errors = tidy_document(
         self.render(), {
             'indent': 1,
             'output-xhtml': True,
             'force-output': 1,
             'doctype': 'strict',
             'show-warnings': tidy_warnings
         })
     if errors:
         print('HTML tidy: ' + str(errors), file=sys.stderr)
     return txt
Code Example #33
 def test_post_note(self):
     code, error = tidylib.tidy_document(
         self._my_class.post_note(**{
             "author": "test",
             "message": "test"
         }),
         options={
             "show-errors": 1,
             "show-warnings": 0
         },
     )
     self.assertFalse(error,
                      "/notes/post_note does not return valid html page")
Code Example #34
 def test_select_item_search(self):
     random_item = random.sample(list(Book.selectBy(status="STOCK")), 1)[0]
     code, error = tidylib.tidy_document(
         self._my_class.select_item_search(
             title=random_item.title.booktitle),
         options={
             "show-errors": 1,
             "show-warnings": 0
         },
     )
     self.assertFalse(
         error,
         "/register/select_item_search does not return valid html page")
Code Example #35
 def clean_html(self, htmlfile):
     try:
         reader = open(htmlfile, 'r')
         content = reader.read()
         reader.close()
         document, errors = tidy.tidy_document(content, options=tidy_options)
         if document:
             writer = open(htmlfile, 'w')
             writer.write(document)
             writer.close()
         print "Cleaned", htmlfile
     except Exception, e:
         print e
Code Example #36
def get_cvk_page(url):
    '''Returns the page text, or nothing on error
    '''
    res = s.get(url, headers=headers, verify=False)
    # print(res.encoding)
    # print(res.url)
    res.encoding = "utf-8"
    if res.status_code != 200:
        print(f"Error <= {url}")
        return
    tidy, errors = tidy_document(res.text)
    # print(errors)
    return tidy
Code Example #37
 def test_select_item_for_isbn_search_functional(self):
     random_item = random.sample(list(Title.select()), 1)[0]
     response = self._my_app.get("/admin/select_item_for_isbn_search",
                                 {"isbn": random_item.isbn})
     code, error = tidylib.tidy_document(response.body,
                                         options={
                                             "show-errors": 1,
                                             "show-warnings": 0
                                         })
     self.assertFalse(
         error,
         "/admin/select_item_for_isbn_search did not return proper response"
     )
Code Example #38
    def parse_html(self):
        content, errors = tidy_document(self.page,
                                        options={'char-encoding': 'utf8'})
        #eq_(len(errors), 0 , "Errors found in HTML document:\n%s" % errors)

        b = content.find('nominatim_results =')
        e = content.find('</script>')
        content = content[b:e]
        b = content.find('[')
        e = content.rfind(']')

        self.result = json.JSONDecoder(object_pairs_hook=OrderedDict).decode(
            content[b:e + 1])
Code Example #39
File: parserLattes.py Project: Webteg/crawlerlattes
    def __init__(self, idMembro, cvLattesHTML):
        HTMLParser.__init__(self)

        # mandatory initialization
        self.idMembro = idMembro
        self.sexo = 'Masculino'
        self.nomeCompleto = u'[Nome-nao-identificado]'

        self.item = ''
        self.issn = ''
        self.listaIDLattesColaboradores = []
        self.listaFormacaoAcademica = []
        self.listaAreaDeAtuacao = []
        self.listaIdioma = []

        self.listaArtigoEmPeriodico = []
        self.listaLivroPublicado = []
        self.listaCapituloDeLivroPublicado = []
        self.listaTextoEmJornalDeNoticia = []
        self.listaTrabalhoCompletoEmCongresso = []
        self.listaResumoExpandidoEmCongresso = []
        self.listaResumoEmCongresso = []
        self.listaArtigoAceito = []
        self.listaApresentacaoDeTrabalho = []
        self.listaOutroTipoDeProducaoBibliografica = []


        self.listaParticipacaoEmEvento = []
        self.listaOrganizacaoDeEvento = []


        # initialization to avoid exhaustively searching for some keywords
        self.salvarAtualizacaoCV = 1
        self.salvarFoto = 1
        self.procurarCabecalho = 0
        self.achouGrupo = 0
        self.doi = ''
        self.relevante = 0
        self.idOrientando = ''
        self.complemento = ''

        # work around some errors in the Lattes Platform HTML
        cvLattesHTML = cvLattesHTML.replace("<![CDATA[","")
        cvLattesHTML = cvLattesHTML.replace("]]>","")
        cvLattesHTML = cvLattesHTML.replace("<x<","&lt;x&lt;")
        cvLattesHTML = cvLattesHTML.replace("<X<","&lt;X&lt;")

        # feed it!
        cvLattesHTML, errors = tidy_document(cvLattesHTML, options={'numeric-entities':1})

        self.feed(cvLattesHTML)
Code Example #40
    def _validateHTML(self, moFile):
        """
        This function validates the file ``moFile`` for correct html syntax.

        :param moFile: The name of a Modelica source file.
        :return: (str, str) The tidied markup [0] and warning/error
                 messages[1]. Warnings and errors are returned
                 just as tidylib returns them.

        """
        from tidylib import tidy_document

        entries = self._getInfoRevisionsHTML(moFile)

        # Document header
        header = "<?xml version='1.0' encoding='utf-8'?> \n \
        <!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \n \
    \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"> \n \
<html xmlns=\"http://www.w3.org/1999/xhtml\"> \n \
<head> \n \
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" /> \n \
<title>xxx</title> \n \
</head> \n \
<body> \n \
<!-- +++++++++++++++++++++++++++++++++++++ -->\n"

        body = ""
        for line in entries:
            body += line + '\n'
        # Replace \" with "
        body = body.replace('\\"', '"')

        # Document footer
        footer = "<!-- +++++++++++++++++++++++++++++++++++++ -->\n \
</body>\n \
</html>"

        # Validate the string
        document, errors = tidy_document(r"%s%s%s" % (header, body, footer),
                                         options={
                                             'numeric-entities': 1,
                                             'output-html': 1,
                                             'alt-text': '',
                                             'wrap': 72
                                         })
        # Write html file.
        if self._writeHTML:
            htmlName = "%s%s" % (moFile[0:-2], "html")
            with open(htmlName, mode="w", encoding="utf-8") as f:
                f.write(document)
        return (document, errors)
Code Example #41
def wrap_html(curr_chapter):
    with open(curr_chapter['local'], 'r+', encoding='utf-8') as f:
        content = f.read()
        header = """<?xml version='1.0' encoding='utf-8'?>
            <html xmlns="http://www.w3.org/1999/xhtml">
            <head><title>%s</title></head>
            <body><h1 class="chapter">%s</h1>""" % (curr_chapter['title'],
                                    curr_chapter['title'])
        footer = "</body></html>"
        doc = "%s%s%s" % (header, content, footer)
        clean = tidy_document(doc, options=TIDY_OPTS)
        f.seek(0)
        f.write(clean[0])
        f.truncate()
Code Example #42
File: crawler_test.py Project: ion-plugged/test-NAV
def test_page_should_be_valid_html(page):
    if page.response != 200:
        pytest.skip("not validating non-reachable page")
    if not page.content_type or 'html' not in page.content_type.lower():
        pytest.skip("not attempting to validate non-html page")
    if not should_validate(page.url):
        pytest.skip("skip validation of blacklisted page")
    if not page.content:
        pytest.skip("page has no content")

    document, errors = tidy_document(page.content, TIDY_OPTIONS)
    errors = filter_errors(errors)

    assert not errors, "Found following validation errors:\n" + errors
Code Example #43
File: peticiones.py Project: JaviBF92/TransporBot
def get_stations():
    try:
        web = requests.get(
            'http://horarios.renfe.com/cer/hjcer300.jsp?NUCLEO=30&CP=NO&I=s#',
            timeout=4).text
    except Timeout:
        return None
    else:
        document, errors = tidy_document(web)
        bs = BeautifulSoup(document, 'html.parser')
        estaciones = bs.find('select', {"name": "o"}).findAll('option')
        estaciones_ids = [(option.text.strip().replace(" ", "").lower(),
                           option['value']) for option in estaciones][1:]
        return {key: value for (key, value) in estaciones_ids}
Code Example #44
File: process.py Project: Keson96/ConEx
 def clean(cls, html, tidy=True, body_only=False):
     """
     clean html document
     """
     if body_only:
         cleaner = cls.cleaner_body
     else:
         cleaner = cls.cleaner
     if tidy:
         document, errors = tidy_document(html)
         cleaned = cleaner.clean_html(document)
     else:
         cleaned = cleaner.clean_html(html)
     return cleaned
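
cls.cleaner and cls.cleaner_body are defined outside the excerpt; since clean_html() is the lxml Cleaner API, a hypothetical setup could be:

from lxml.html.clean import Cleaner

class HtmlProcess:  # hypothetical host class for clean()
    # Strip scripts and inline styles everywhere...
    cleaner = Cleaner(scripts=True, style=True)
    # ...and additionally drop the <html>/<head>/<title> page structure.
    cleaner_body = Cleaner(scripts=True, style=True, page_structure=True)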
Code Example #45
def carrega_dados_lei(lei):
    response = urllib2.urlopen(lei.get('url'))
    html, errors = tidy_document(response.read(), tidy_options)
    if errors:
        sys.exit(errors)
    parsed_html = BeautifulSoup(html)

    titulo = parsed_html.find('h4').text.strip()
    descricao = parsed_html.find('td', attrs={'id': 'prim_col'}).text.strip()
    dados = {'titulo_completo': titulo, 'descricao': descricao}
    if lei.has_key('voto'):
        del (lei['voto'])
    lei.update(dados)
    return lei
Code Example #46
    def htmlCorrection(
        self,
        htmlStr: str,
        substitutions_dict: dict = {
            '"': '\\"',
            '<br>': '<br />',
            '<br/>': '<br />'
        }
    ) -> (str, str):
        """Returns cleaned html code and found errors

        Calls tidylib which will produce a clean version of the html code
        and also the errors that it has found.

        Parameters
        ----------
        htmlStr : str
                The html code as a single string.
        substitutions_dict : dict
                A dictionary with key:value pairs for old and new text.
                The html code must be escaped in Modelica. Generate properly
                escaped code we need to add the escape characters. All the
                while we can replace html errors that Dymola introduces.
                i.e. '<br>' -> '<br />'

        Returns
        -------
        str
                The tidy html code with escape characters as one string.
        str
                The error messages from tidylib.
        """
        from tidylib import tidy_document

        # Validate the string
        htmlCorrect, errors = tidy_document(f"{htmlStr}",
                                            options={
                                                'doctype': 'html5',
                                                'show-body-only': 1,
                                                'numeric-entities': 1,
                                                'output-html': 1,
                                                'wrap': 72,
                                                'alt-text': '',
                                            })

        document_corr = HTML_Tidy.make_string_replacements(
            self, theString=htmlCorrect, substitutions_dict=substitutions_dict)

        return document_corr, errors
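
A usage sketch, assuming `checker` is an instance of the HTML_Tidy class this method belongs to (the input markup is illustrative):

# Hypothetical call: clean a snippet taken from a Modelica annotation.
fixed, errors = checker.htmlCorrection('<p>Documentation text<br></p>')
# `fixed` now uses '<br />' and has '"' escaped as '\\"' so it can be
# written back into the Modelica source; `errors` is tidy's report.
if errors:
    print(errors)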
Code Example #47
 def __parse(self,
             url: str = None,
             html: str = None,
             cookies: str = None) -> (dict, str):
     """
     Make an HTML/URL parsing by processing ALL found tags
     :param url: The url to parse (or None)
     :param html: The html page to parse as string (or None)
     :param cookies: The cookies to use on parsing
     :return: dictionary of tags, cookies
     """
     self.url = None
     self.base_url = None
     is_image = False
     if url is not None:
         self.url = url
         url_parsed = urlparse(url)
         self.url_scheme = str(url_parsed.scheme)
         self.base_url = self.url_scheme + '://' + str(url_parsed.netloc)
         r = HttpRequest.request(url, cookies=cookies)
         if r is None:
             return None
         if r.status_code >= 400 or r.headers.get(
                 'Content-Type') in HtmlParser._unacceptable_content_types:
             return None
         try:
             html = r.json()
             Log.warning('Trying to parse a json with HTML parser!')
         except ValueError:
             html = r.text
         if r.headers is not None:
             for k, v in r.headers.items():
                 if k.lower() == 'set-cookie':
                     cookies = v
         if HttpRequest.is_image(r):
             is_image = True
             xmp_start = html.find('<x:xmpmeta')
             xmp_end = html.find('</x:xmpmeta')
             xmp_str = html[xmp_start:xmp_end + 12]
             html = xmp_str
     if is_image:
         sorted_html = html
     else:
         sorted_html, errors = tidy_document(
             html)  # Sort html (and fix errors)
     self.feed(sorted_html)
     if cookies is None:
         cookies = ''
     return self.tags, cookies
Code Example #48
 def validate_html_code(template, mongodb, redis_db):
     _id_key = redis_db.incr('html_log')
     document, errors = tidy_document(template,
                                      options={
                                          'numeric-entities': 1,
                                          'char-encoding': 'utf8'
                                      })
     errors = errors.split('\n')
     mongodb.html_log.insert({
         '_id': _id_key,
         'time': get_time.datetime_to_secs(),
         'tmpl': template,
         'log': errors
     })
     return document
Code Example #49
def _tidyHTML(doc):
    body = ''.join(doc)
    body = body.replace('\\"', '"')

    _, errors = tidy_document(r'{0}'.format(body), options=TIDY_OPTIONS)

    error_list = errors.rstrip().split('\n')
    errors = []
    for error in error_list:
        error = error.rstrip().split('\n')
        for err in error:
            if bool(err) and not any(id in err for id in IGNORE_IDS):
                errors.append(err)

    return errors
Code Example #50
    def validate(self):
        document, errors = tidylib.tidy_document(self.content,
                                                 options=self.TIDY_OPTIONS)

        for line in errors.splitlines():
            for marker in self.markers:
                pattern = r"line \d+ column \d+ - {}:".format(marker)
                if re.match(pattern, line, flags=re.IGNORECASE):
                    self.log.warning("Tidy report:\n%s", errors)
                    raise ValidateError("invalid HTML content")

        if self.permissive and errors:
            self.log.debug("Tidy report:\n%s", errors)
        self.log.info("Clean HTML document")
        return document  # return its XHTML version
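
TIDY_OPTIONS, markers and permissive live outside the excerpt; a hypothetical minimal configuration consistent with the regular expression above:

class HtmlValidator:  # hypothetical host class for validate()
    TIDY_OPTIONS = {'show-warnings': 1}
    # Tidy report lines look like "line 3 column 1 - Warning: ...";
    # any marker listed here makes such a line fatal.
    markers = ('Error', 'Warning')
    permissive = False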
Code Example #51
File: get_quizzes.py Project: njoroge33/py_learn
def get_quiz_content(page):
    try:
        soup = BeautifulSoup(page, 'html.parser')
        q_desc_html = soup.find(class_='story')
        x_c = [tidy_document(str(x)) for x in q_desc_html.find_all('p')]

        x_s = BeautifulSoup(x_c[0][0], 'html.parser')
        q_desc = x_s.find('p').text.strip()
        return [x for x in q_desc.split('.') if x]
    except Exception as err:
        q_c = [x for x in q_desc_html.descendants][:2]
        if isinstance(q_c[1], bs4.element.Tag):
            q_c[1] = q_c[1].text
        else:
            q_c[1] = ''
        return q_c
Code Example #52
    def process_item(self, item, spider):
        data = pprint.pformat(item['content'])
        self.write(item, data, "html")

        document, errors = tidy_document(data, options={'numeric-entities': 1})

        self.write(item, document, "html.tidy")
        self.write(item, errors, "html.errors")

        #        my_reporter.read(html=data)
        #        data= my_reporter.report_news()

        self.write(item, "\n".join(item['links']), "links.txt")

        #
        return item
Code Example #53
File: validate.py Project: svopper/SDAB-Validator
def validate(folder, files):
    for fileName in files:
        if fileName.lower().endswith(".html"):
            try:
                file = open(fileName, "r")
                filetext = "".join([s for s in file.readlines()])
                _, errors = tidy_document(filetext,
                                          options={"numeric-entities": 1})
                if len(errors) == 0:
                    return "HTML was successfully validated with no errors\n"
                return "HTML was validated, with following errors:\n  - " + errors.replace(
                    "\n", "\n  - ").strip("  - ")
            except Exception as _:
                return "An error occured while validating html file\n"
            finally:
                file.close()
Code Example #54
    def corrigirHTML(self, cvLattesHTML):
        extended_chars = u''.join(
            unichr(c)
            for c in xrange(127, 65536, 1))  # srange(r"[\0x80-\0x7FF]")
        special_chars = ' -' ''
        cvLattesHTML = cvLattesHTML.decode(
            'iso-8859-1', 'replace')  #+extended_chars+special_chars
        #cvLattesHTML  = cvLattesHTML.decode('ascii','replace')+extended_char+special_chars # Wed Jul 25 16:47:39 BRT 2012

        # work around some errors in the Lattes Platform HTML
        cvLattesHTML = cvLattesHTML.replace("<![CDATA[", "")
        cvLattesHTML = cvLattesHTML.replace("]]>", "")
        arquivoHTML, errors = tidy_document(cvLattesHTML,
                                            options={'numeric-entities': 1})
        #print errors
        return arquivoHTML
Code Example #55
def checkurl_html(url: str,
                  status_code: Optional[int] = 200,
                  mimetype: Optional[str] = 'text/html',
                  has: List[str] = [],
                  hasnot: List[str] = []) -> str:
    __tracebackhide__ = True
    document = checkurl_text(url, status_code, mimetype, has, hasnot)
    if html_validation:
        for line in tidy_document(document,
                                  options=TIDY_OPTIONS)[1].split('\n'):
            if not line:
                continue
            if 'trimming empty <span>' in line:
                continue
            pytest.fail(f'tidy error: {line}')
    return document
Code Example #56
def carrega_dados_politico(idx):
    response = urllib2.urlopen('http://www.excelencias.org.br/@parl.php?id=%s'%idx)
    html, errors = tidy_document(response.read(), tidy_options)
    if errors:
        sys.exit(errors)
    parsed_html = BeautifulSoup(html)

    conteudo = parsed_html.body.find('div', attrs={'id':'conteudo'}).find_all('div', attrs={'id':'contem_parl'})
    if len(conteudo) <= 4:
        return

    bloco_principal = conteudo[0]
    bloco_votacoes = None
    for bloco in conteudo:
        if bloco.find('div', attrs={'id':'contem_titulo_parl'}).text.strip() == u'Como votou matérias no Plenário':
            bloco_votacoes = bloco
    if not bloco_votacoes:
        return

    nome = bloco_principal.find('div', attrs={'id':'contem_titulo_parl'}).text.strip()
    if not nome:
        return
    tabela_votacoes = bloco_votacoes.find('table', attrs={'class':'livre'})

    lista_votos = []
    if tabela_votacoes:
        for linha in tabela_votacoes.find_all('tr'):
            titulo_lei = linha.find('td', attrs={'id':'prim_col'}).text.strip()
            voto = linha.find('td', attrs={'class':'esq'}).text.strip()
            link_lei = linha.find('td', attrs={'id':'prim_col'}).find('a')
            url_lei = ''
            if link_lei:
                cod, num, ano, casa = link_lei.get('href').replace('javascript:parent.traz_pl(', '').split(',')
                url_lei = 'http://www.excelencias.org.br/modulos/parl_projetolei.php?cod=%s&num=%s&ano=%s&casa=%s'
                url_lei = url_lei %(cod.replace("'", ''), num, ano, casa.replace(')', ''))

            lista_votos.append({
                'titulo': titulo_lei,
                'voto': voto,
                'url': url_lei
            })
    dados_politico = {
        'idx': int(idx),
        'nome': nome,
        'votos': lista_votos
    }
    return dados_politico
Code Example #57
def tidy_html(html):
    if hasattr(html, 'read'):
        html = html.read()
    html5, errors = tidy_document(
        html,
        options={
            'merge-divs':
            0,  # do not merge nested div elements - preserve semantic block structures
            'output-xml': 0,
            'indent': 1,
            'tidy-mark': 0,
            'wrap': 0,
            'alt-text': '',
            'doctype': 'html5',
            'markup': 1
        })
    return html5
Code Example #58
File: validate_html.py Project: reesporte/bookwyrm
def validate_html(html):
    """run tidy on html"""
    _, errors = tidy_document(
        html.content,
        options={
            "drop-empty-elements": False,
            "warn-proprietary-attributes": False,
        },
    )
    # not sure how else to filter out these unescaped-ampersand errors
    errors = "\n".join(
        e for e in errors.split("\n")
        if "&book" not in e and "&type" not in e and "id and name attribute"
        not in e and "illegal characters found in URI" not in e
        and "escaping malformed URI reference" not in e)
    if errors:
        raise Exception(errors)
Code Example #59
File: modules.py Project: cash2one/sitecheck
    def process(self, request, response, report):
        global _tidy_available
        #TODO: Hash errors and don't log duplicate error sets (just a reference)
        if response.is_html and _tidy_available:
            try:
                doc, err = tidy_document(response.content,
                                         options=self.options)
            except:
                report.add_error('Unable to parse response')
            else:
                l = err.splitlines()
                if len(l) > 0:
                    for e in l:
                        report.add_message('{0}'.format(
                            re.sub('^line\\b', 'Line', e)))

                    report.add_message('Total: {0}'.format(len(l)))
Code Example #60
File: sanitize.py Project: zwd1990/django-th
def sanitize(html):
    # with "from __future__ import unicode_literals",
    # tidy_document does not accept any other options,
    # such as merge-divs, char-encoding and so on
    document, errors = tidy_document(
        html, options={"output-xhtml": 1, "force-output": 1})

    try:
        parsed_dom = parseString(document)
        document_element = parsed_dom.documentElement
        remove_prohibited_elements(document_element)
        remove_prohibited_attributes(document_element)
        body = document_element.getElementsByTagName("body")[0]
        body.tagName = "en-note"
        return body.toxml()
    except ExpatError:
        return ''