Ejemplo n.º 1
0
 def test_frag_with_entity(self):
     """Named entities survive a default tidy pass and become numeric on request."""
     fragment = "&eacute;"
     # Default options keep the named-entity form.
     result, _err = tidy_fragment(fragment)
     self.assertEqual(result, "&eacute;")

     # numeric-entities=1 rewrites it as a numeric character reference.
     result, _err = tidy_fragment(fragment, {'numeric-entities': 1})
     self.assertEqual(result, "&#233;")
Ejemplo n.º 2
0
    def test_frag_with_entity(self):
        """Entity handling: named form by default, numeric when asked for."""
        fragment = "&eacute;"

        tidied, _err = tidy_fragment(fragment)
        self.assertEqual(tidied, "&eacute;")

        tidied, _err = tidy_fragment(fragment, {'numeric-entities': 1})
        self.assertEqual(tidied, "&#233;")
Ejemplo n.º 3
0
    def test_frag_with_unclosed_tag(self):
        h = "<p>hello"
        expected = '''<p>
  hello
</p>'''
        doc, err = tidy_fragment(h)
        self.assertEqual(doc, expected)
    def handle(self, *args, **kwargs):
        """Management command: re-tidy the HTML bodies of stories named by URL.

        Each positional arg is a URL whose last four path segments are
        year/month/day/slug; the matching Story's body is validated with
        BeautifulSoup and, if it fails to parse, repaired with tidy_fragment
        and saved.

        NOTE(review): Python 2 syntax throughout (print statements,
        ``except X, e``).
        """
        month_format = '%b'
        day_format = '%d'
        for url in args:
            parts = url.split('/')

            if len(parts) < 4:
                return "URL doesn't parse into at least year/month/day/slug"
            # Drop the trailing empty segment from URLs ending in '/'.
            if parts[-1] == "":
                empty = parts.pop()
            slug = parts.pop()
            day = parts.pop()
            month = parts.pop()
            year = parts.pop()
            try:
                # Parse the date with the command's month/day formats; a bad
                # date is treated as a missing page.
                tt = time.strptime(
                    '%s-%s-%s' % (year, month, day),
                    '%s-%s-%s' % ('%Y', month_format, day_format))
                date = datetime.date(*tt[:3])
            except ValueError:
                raise Http404
            story = Story.objects.get(publish_date=date, slug=slug)
            try:
                BeautifulSoup(story.body)  # error that happens in paginator
                print "Story HTML is valid."
            except HTMLParseError, e:
                story.body = tidy_fragment(story.body)[0]  # tidy the frag
                print "Story HTML is invalid, fixing and saving story."
                story.save()
                try:
                    # Re-validate after tidying to confirm the fix took.
                    BeautifulSoup(
                        story.body)  # error that happens in paginator
                    print "Story HTML is valid."
                except HTMLParseError, e:
                    print "Story HTML was not able to be fixed. Object pk: %s" % story.id
Ejemplo n.º 5
0
 def clean_html_fragment(self, body):
     """Run *body* through tidy as strict XHTML and return the cleaned markup."""
     tidy_options = {
         "output-xhtml": 1,
         "doctype": 'strict',
     }
     cleaned, _errors = tidy_fragment(body, options=tidy_options)
     return cleaned
Ejemplo n.º 6
0
def parse_book_file(href, book):
    """Fill *book* with page count, annotation and contents scraped from *href*.

    The file is parsed relative to ``books_dir``. Annotation/contents become
    the tidied HTML of the siblings that follow the matching marker table,
    up to (not including) the next table.
    """
    tree = lxml.html.parse(join(books_dir, href), parser)

    if 'page_count' not in book:
        cells = tree.xpath(
            "//td[descendant::*[contains(text(), '{}')]]".format(book['title']))
        if len(cells):
            info = cells[0].xpath(
                "descendant::*[contains(text(), 'страниц')]")
            if len(info):
                book['page_count'] = patterns[0][1].search(
                    tostring(info[0], encoding='unicode')).groups()[0]

    sections = {
        'annotation': tree.xpath(
            r"//table[descendant::*[contains(text(), 'Аннотация')]]"),
        'contents': tree.xpath(
            r"//table[descendant::*[contains(text(), 'Содержание')]]"),
    }
    for name, tables in sections.items():
        if not len(tables):
            continue
        marker = tables[-1]
        pieces = []
        for sibling in marker.itersiblings():
            if sibling.tag == "table":
                break
            drop_a(sibling)
            remove_attr(sibling)
            pieces.append(tostring(sibling, encoding='unicode'))
        book[name] = tidy_fragment(clean("".join(pieces)))[0]
    return book
Ejemplo n.º 7
0
def clean( html ):
    """Whitelist-sanitize *html* with bleach, then repair it with tidylib."""
    if not html:
        return html
    # Avoid shadowing the function name with the intermediate result.
    bleached = bleach.clean( html, tags = local_config.TAG_WHITELIST, attributes = local_config.ATTRIBUTE_WHITELIST )
    # tidy catches some structural problems bleach leaves behind
    tidied, warnings = tidylib.tidy_fragment( bleached )
    return tidied
Ejemplo n.º 8
0
 def html(cls,
          string,
          show_everything=False,
          translation=gettext.NullTranslations()):  # pylint: disable=unused-argument
     """Parse *string* as HTML and return the tidied markup."""
     # Only the document matters here; tidy's error log is discarded.
     return tidylib.tidy_fragment(string)[0]
Ejemplo n.º 9
0
def link_title_uid_txt(i):
    """Extract ``(link, title, rss_uid, txt)`` from a reader feed item *i*.

    Returns None (implicitly) when the item has no usable summary/content
    HTML, or when the extracted plain text is empty.
    """
    if 'alternate' in i:
        link = i['alternate'][0]['href']
    else:
        link = ''

    if 'title' in i:
        title = unescape(i['title'])
    else:
        # '无题' is Chinese for "untitled".
        title = '无题'

    rss_uid = i.get('id') or 1

    # Prefer the summary, falling back to the full content.
    # (The original appended a redundant `or None` and re-tested
    # `if snippet:` after the guard below; both were dead code.)
    snippet = i.get('summary') or i.get('content')
    if not snippet:
        return

    htm = snippet['content']
    if not htm:
        return

    # Tidy the HTML, preserve <pre> line breaks, then flatten to text.
    htm = txttidy(htm)
    htm = txt_map('<pre', '</pre>', htm, pre_br)
    htm = tidy_fragment(htm, {'indent': 0})[0]
    htm = htm.replace('<br />', '\n')
    txt = htm2txt(htm)

    if not txt:
        return

    return link, title, rss_uid, txt
Ejemplo n.º 10
0
def link_title_uid_txt(i):
    """Extract ``(link, title, rss_uid, txt)`` from a reader feed item *i*.

    Returns None (implicitly) when the item carries no usable HTML snippet
    or when the snippet converts to empty text.
    """
    if 'alternate' in i:
        link = i['alternate'][0]['href']
    else:
        link = ''
    if 'title' in i:
        title = i['title']
        title = unescape(title)
    else:
        # '无题' is Chinese for "untitled".
        title = '无题'
    rss_uid = i.get('id') or 1
    # Prefer summary over content; the trailing `or None` is redundant.
    snippet = i.get('summary') or i.get('content') or None

    if not snippet:
        return

    if snippet:  # always true after the guard above; kept as-is
        htm = snippet['content']
        if not htm:
            return

    # Tidy the HTML, preserve <pre> line breaks, then flatten to text.
    htm = txttidy(htm)
    htm = txt_map('<pre', '</pre>', htm, pre_br)
    htm = tidy_fragment(htm, {'indent': 0})[0]
    htm = htm.replace('<br />', '\n')
    txt = htm2txt(htm)

    if not txt:
        return

    return link, title, rss_uid, txt
Ejemplo n.º 11
0
 def test_frag_with_unclosed_tag(self):
     """Tidy should close the dangling <p> and indent its body."""
     expected = '''<p>
   hello
 </p>'''
     result, _err = tidy_fragment("<p>hello")
     self.assertEqual(result, expected)
Ejemplo n.º 12
0
    def get_article_text(self, body):
        """
        Gets the article main text
        :param body: parsed page containing a div.article-body
        :return: (tidied article HTML, gremlin-zapped plain text or None)
        """
        raw_article_body = body.find("div", {"class": "article-body"})

        # Plain-text version: strip markup, then zap gremlin characters.
        article_body_no_html = raw_article_body
        if article_body_no_html is not None:
            article_body_no_html = self.gremlin_zapper.zap_string(
                article_body_no_html.get_text())

        # HTML version: zap tag contents in place, then serialize children.
        if raw_article_body is None:
            article_body = ''
        else:
            self.zap_tag_contents(raw_article_body)
            article_body = ''.join(str(item) for item in raw_article_body.contents)

        article_body, errors = tidy_fragment(article_body, options={'numeric-entities': 1})

        return article_body, article_body_no_html
Ejemplo n.º 13
0
def sanitize_html(value):
    """Reduce *value* to the small tag set a Vodafone Live CONTAINER accepts.

    Removes comments and all tag attributes, hides tags outside the allowed
    set (keeping their text), maps editor tags (em/strong) back to Vlive
    tags (i/b), runs the result through tidy as utf8, and finally replaces
    common HTML entities with plain-ASCII stand-ins.

    NOTE(review): uses the legacy BeautifulSoup 3 API (``from BeautifulSoup
    import ...``, ``findAll``, ``renderContents``).
    """
    from BeautifulSoup import BeautifulSoup, Comment, Tag

    # FIXME: 'None' should never be saved as text
    if value is None:
        return ""

    # allowed tags for a Vodafone Live <CONTAINER type="data" />
    # this doubles up as a translation table. CKEditor does new-ish
    # HTML than Vodafone Live will accept. We have to translate 'em' back
    # to 'i', and 'strong' back to 'b'.
    #
    # NOTE: Order is important since <strong>'s can be inside <p>'s.
    tags = (
        ("em", "i"),  # when creating them in the editor they're EMs
        ("strong", "b"),
        ("i", "i"),  # when loading them as I's the editor leaves them
        ("b", "b"),  # we keep them here to prevent them from being removed
        ("u", "u"),
        ("br", "br"),
        ("p", "p"),
    )
    valid_tags = [tag for tag, replacement_tag in tags]
    soup = BeautifulSoup(value)

    # remove all comments from the HTML
    for comment in soup.findAll(text=lambda text: isinstance(text, Comment)):
        comment.extract()

    # hide all tags that aren't in the allowed list, but keep
    # their contents
    for tag in soup.findAll(True):
        # Vodafone Live allows for no tag attributes
        tag.attrs = []
        if tag.name not in valid_tags:
            tag.hidden = True

    # replace tags with Vlive equivelants
    for element, replacement_element in tags:
        if element is not replacement_element:
            for tag in soup.findAll(element):
                replacement_tag = Tag(soup, replacement_element)
                replacement_tag.insert(0, tag.text)
                tag.replaceWith(replacement_tag)

    # serialize, then let tidy repair the structure as utf8
    xml = soup.renderContents().decode("utf8")
    fragment, errors = tidy_fragment(xml, {"char-encoding": "utf8"})

    # map common entities to plain-text approximations for the handset
    return (
        fragment.replace("&nbsp;", " ")
        .replace("&rsquo;", "'")
        .replace("&lsquo;", "'")
        .replace("&quot;", '"')
        .replace("&ldquo;", '"')
        .replace("&rdquo;", '"')
        .replace("&bull;", "- ")
        .replace("&eacute;", "e")
        .replace("&Eacute;", "E")
        .replace("&ndash;", "-")
    )
Ejemplo n.º 14
0
def parse_book_file(href, book):
    """Fill *book* with page count, annotation and contents scraped from *href*.

    The file is parsed relative to ``books_dir``; annotation/contents are
    the tidied HTML of the siblings that follow the matching marker table,
    up to (not including) the next table.
    """
    block = {}
    book_tree = lxml.html.parse(join(books_dir, href), parser)
    if not 'page_count' in book:
        # Find the <td> mentioning the book title, then the element within
        # it that mentions page count ('страниц' = Russian for "pages").
        td = book_tree.xpath(
            "//td[descendant::*[contains(text(), '{}')]]".format(
                book['title']))
        if len(td):
            td = td[0]
            page_info = td.xpath("descendant::*[contains(text(), 'страниц')]")
            if len(page_info):
                book['page_count'] = patterns[0][1].search(
                    tostring(page_info[0], encoding='unicode')).groups()[0]

    # 'Аннотация' = annotation, 'Содержание' = table of contents.
    block['annotation'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Аннотация')]]")
    block['contents'] = book_tree.xpath(
        r"//table[descendant::*[contains(text(), 'Содержание')]]")
    for key in block:
        if len(block[key]):
            mark = block[key][-1]
            book[key] = ""
            # Accumulate the siblings after the marker until the next table.
            for element in mark.itersiblings():
                if element.tag == "table":
                    break
                drop_a(element)
                remove_attr(element)
                book[key] += tostring(element, encoding='unicode')
            book[key] = tidy_fragment(clean(book[key]))[0]
    return book
Ejemplo n.º 15
0
def remove_tags(text):
    """Unescape *text*, tidy it, then strip every remaining HTML tag."""
    import re
    import html

    from tidylib import tidy_fragment

    unescaped = html.unescape(text)
    tidied, _errors = tidy_fragment(unescaped)
    return re.sub(r'<[^>]+>', '', tidied)
Ejemplo n.º 16
0
    def test_frag_with_unicode_subclass(self):
        """tidy_fragment should accept unicode subclasses and round-trip them.

        NOTE(review): Python 2 only (``unicode`` builtin, ``u""`` literal).
        """
        class MyUnicode(unicode):
            pass

        h = MyUnicode(u"unicode string ß")
        expected = h
        doc, err = tidy_fragment(h)
        self.assertEqual(doc, expected)
Ejemplo n.º 17
0
    def test_frag_with_unicode_subclass(self):
        """A text-type subclass instance should round-trip through tidy unchanged."""
        class TaggedText(utype):
            pass

        fragment = TaggedText("unicode string ß")
        result, _err = tidy_fragment(fragment)
        self.assertEqual(result, fragment)
Ejemplo n.º 18
0
    def natural_selection(self, generation, gene, eval_place, individual_i):
        """I don't have to be the be the fittest and fastest to survive -- I just have to be fitter and faster than YOU!

        Renders *gene* into the HTML template at slot *eval_place*, writes it
        to a per-individual file, and scores it: tidy warnings/errors
        subtract from the score; a payload Selenium reports as running adds
        ``self.bingo_score`` and is recorded in ``self.result_list``.

        :param generation: current generation; must expose ``genomes``
        :param gene: gene expressed via ``gene_to_str``
        :param eval_place: template slot the payload is rendered into
        :param individual_i: index substituted into the output filename
        :return: ``(score, 0)`` for a scored run, or ``(None, 1)`` when the
                 run is skipped or Selenium reports an error
        """
        # NOTE(review): `not x` already covers `x is None`; the double
        # checks below are redundant but preserved as-is.
        if not generation or generation is None:
            raise SequencerValidationException("[!] generation is required.")

        if not gene or gene is None:
            raise SequencerValidationException("[!] gene is required.")

        if not eval_place or eval_place is None:
            raise SequencerValidationException("[!] eval_place is required.")

        if not individual_i or individual_i is None:
            raise SequencerValidationException("[!] individual_i is required.")

        sv = SeleniumValidator()
        indiv = gene_to_str(gene, generation.genomes)
        html = self.template.render({eval_place: indiv})
        eval_html_path = os.path.realpath(
            os.path.join(self.html_dir,
                         self.html_file.replace("*", str(individual_i))))

        with open(eval_html_path, "w", encoding="utf-8") as _html:
            _html.write(html)

        payload, errors = tidy_fragment(html)

        # Count tidy diagnostics in the error log text.
        warnings = len(re.findall(r"(Warning)\W", errors))
        errors = len(re.findall(r"(Error)\W", errors))

        if warnings > 0:
            warnings = float(warnings) * -0.2  # -0.1
        if errors > 0:
            errors = float(errors) * -1.1  # -1.0
        else:
            # NOTE(review): zero tidy errors aborts scoring entirely, even
            # when warnings were found -- presumably intentional; confirm.
            return None, 1

        int_score = warnings + errors
        # result = test_payload_with_selenium(self.web_driver, str("file://" + eval_html_path))
        result = sv.validate_payload(
            (self.web_driver, str("file://" + eval_html_path)))
        selenium_score = result["score"]
        if result["error"]:
            return None, 1

        if selenium_score > 0:
            # Payload executed -- record the winning combination.
            print("[*] Found running script: \"{}\" in {}.".format(
                indiv, eval_place))
            int_score += self.bingo_score
            self.result_list.append([eval_place, generation.genomes, indiv])

        return int_score, 0
Ejemplo n.º 19
0
def addSection(link, title):
    """Build an ez_epub Section from a paulgraham.com essay (or external link).

    Local essays are scraped out of the fixed-width layout tables; external
    links are split on double <br/>. Each paragraph is tidied and appended
    to the section as markup.

    NOTE(review): Python 2 syntax (print statement, ``except Exception, e``).
    """
    if not 'http' in link:
        page = urllib2.urlopen('http://www.paulgraham.com/' + link).read()
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
    else:
        page = urllib2.urlopen(link).read()

    section = ez_epub.Section()
    try:
        section.title = title
        print section.title

        if not 'http' in link:
            # Essay text lives in a 435px (or 374px) wide layout table.
            if len(soup.findAll('table', {'width': '435'})) != 0:
                font = str(
                    soup.findAll('table', {'width': '435'})[0].findAll('font')
                    [0]).strip("<font face=\"verdana\" size=\"2\">")
            elif len(soup.findAll('table', {'width': '374'})) != 0:
                font = str(
                    soup.findAll('table', {'width': '374'})[0].findAll('font')
                    [0]).strip("<font face=\"verdana\" size=\"2\">")
            # Skip navigation/boilerplate blocks and implausibly short text.
            if not 'Get funded by' in font and not 'Watch how this essay was' in font and not 'Like to build things?' in font and not len(
                    font) < 100:
                content = font
            else:
                content = ''
                for par in soup.findAll('p'):
                    content += str(par)
            for p in content.decode('utf-8').split("<br/><br/>"):
                p, error = tidy_fragment(p)
                if p == '</':
                    continue
                # tidy mangles some anchors into "<xa"; undo that.
                if p.__contains__("<xa"):
                    p = p.replace("<xa", "<a")
                section.text.append(genshi.core.Markup(p))
        else:
            for p in str(page).replace("\n", "<br/>").split("<br/><br/>"):
                p, error = tidy_fragment(p)
                if p.__contains__("<xa"):
                    p = p.replace("<xa", "<a")
                section.text.append(genshi.core.Markup(p))
    except Exception, e:
        print str(e)
        pass
Ejemplo n.º 20
0
def tidy_html(html):
    """
    Process an input string containing HTML and return a tuple (xhtml,
    errors, warnings) containing the output of tidylib and lists of
    validation errors and warnings.

    Input must be unicode.
    Output will be valid XHTML.
    """
    if not isinstance(html, unicode):
        raise ValueError("tidyhtml must be called with a Unicode string!")

    warnings = list()
    errors = list()

    # First, deal with embedded control codes. Snapshot the offending
    # characters *before* substituting: the original code ran findall()
    # on the already-cleaned string, so the reported set was always empty.
    control_chars = CONTROL_CHAR_RE.findall(html)
    html, sub_count = CONTROL_CHAR_RE.subn(" ", html)
    if sub_count:
        warnings.append("Stripped %d control characters from body: %s" % (
            sub_count,
            set(ord(i) for i in control_chars)
        ))

    html, messages = tidylib.tidy_fragment(
        html.strip(),
        {
            "char-encoding":               "utf8",
            "clean":                        False,
            "drop-empty-paras":             False,
            "drop-font-tags":               True,
            "drop-proprietary-attributes":  False,
            "fix-backslash":                True,
            "indent":                       True,
            "output-xhtml":                 True,
        }
    )

    messages = filter(None, (l.strip() for l in messages.split("\n") if l))

    # postprocess warnings to avoid HTML fragments being reported as lacking
    # doctype and title. Note: `warnings` is NOT reset here -- the original
    # code re-initialized it, silently discarding the control-character
    # warning collected above.
    for msg in messages:
        if "Warning: missing <!DOCTYPE> declaration" in msg:
            continue
        if "Warning: inserting missing 'title' element" in msg:
            continue
        if "Warning: inserting implicit <body>" in msg:
            continue

        if "Error:" in msg:
            errors.append(msg)
        else:
            warnings.append(msg)

    return html, errors, warnings
Ejemplo n.º 21
0
def object_for_typepad_object(tp_obj):
    """Return ``(really_a_share, Object)`` for a TypePad asset.

    Reuses a previously imported Object with the same foreign id when one
    exists; otherwise builds a new one (tidying the body HTML) and resolves
    reply/reblog referents recursively.

    NOTE(review): Python 2 syntax (``except X, exc``); the visible source
    ends inside the reblog_of_url branch with no final return -- this
    snippet appears truncated.
    """
    try:
        obj = Object.objects.get(service='typepad.com', foreign_id=tp_obj.url_id)
    except Object.DoesNotExist:
        pass
    else:
        log.debug("Reusing typepad object %r for asset %s", obj, tp_obj.url_id)
        return False, obj

    log.debug("Making new object for TypePad post %s by %s", tp_obj.url_id, tp_obj.author.display_name)

    author = account_for_typepad_user(tp_obj.author)
    body = tp_obj.rendered_content
    if not body and tp_obj.content:
        # Plain-text content: wrap double-newline paragraphs in <p> tags.
        if tp_obj.text_format == 'html_convert_linebreaks':
            body = '\n\n'.join(u'<p>%s</p>' % t for t in tp_obj.content.split('\n\n'))
        else:
            body = tp_obj.content
    if body:
        body, errors = tidy_fragment(body)
    else:
        body = ''

    obj = Object(
        service='typepad.com',
        foreign_id=tp_obj.url_id,
        render_mode='mixed',
        title=tp_obj.title,
        body=body,
        time=tp_obj.published,
        permalink_url=tp_obj.permalink_url,
        author=author,
    )

    if getattr(tp_obj, 'in_reply_to', None) is not None:
        # This post is in reply, so we don't care if our referent was
        # really a share. Be transitively in reply to the shared obj.
        really_a_share, obj.in_reply_to = object_for_typepad_object(tp_obj.in_reply_to)
    elif getattr(tp_obj, 'reblog_of', None) is not None:
        # Assets are public so it's okay if we use an anonymous typd here.
        t = typd.TypePad(endpoint='http://api.typepad.com/')
        reblog_of = t.assets.get(tp_obj.reblog_of.url_id)

        really_a_share, obj.in_reply_to = object_for_typepad_object(reblog_of)
        remove_reblog_boilerplate_from_obj(obj)
        if not obj.body:
            return True, obj.in_reply_to
    elif getattr(tp_obj, 'reblog_of_url', None) is not None:
        # Only the URL of the referent is known; try to resolve it, but a
        # failure just leaves in_reply_to unset.
        reblog_url = tp_obj.reblog_of_url
        try:
            in_reply_to = leapfrog.poll.embedlam.object_for_url(reblog_url)
        except leapfrog.poll.embedlam.RequestError, exc:
            in_reply_to = None
        except ValueError, exc:
            in_reply_to = None
            log.error("Error making object from referent %s of %s's post %s", reblog_url, author.display_name, tp_obj.url_id)
            log.exception(exc)
Ejemplo n.º 22
0
 def html(self, string):
     """Parses HTML"""
     # Guard clauses instead of an if/elif/else ladder; same semantics.
     if "allow_html" not in INGIniousConfiguration or INGIniousConfiguration["allow_html"] == False:
         raise Exception("HTML is not allowed")
     if INGIniousConfiguration["allow_html"] == "tidy":
         import tidylib
         return tidylib.tidy_fragment(string)[0]
     return string
Ejemplo n.º 23
0
def cleanupText(text):
    """This method cleans up the text of the report using libtidy"""
    # tidylib options
    options = {
        "output_xhtml": 1,
        "add_xml_decl": 1,
        "indent": 1,
        "tidy_mark": 0,
        "char_encoding": "utf8",
        "quote_nbsp": 0,
    }
    # strip html entities, then run tidy over the utf-8 encoded text
    tidied, _errors = tidy_fragment(unescape(text).encode("utf8"), options, keep_doc=False)
    return str(tidied)
Ejemplo n.º 24
0
def html2xhtml(html, **options):
    """Convert *html* to XML via tidy; raise when tidy reports anything."""
    options.update(doctype='omit', show_warnings=0, indent=0, output_xml=1)
    document, errors = tidy_fragment(html, options=options)
    if errors:
        raise Exception("Errors while processing %s\n==========\n%s" % (html, errors))
    return document
Ejemplo n.º 25
0
def fix_open_tags(source):
    """ Fixes missing tags in html fragments. """
    # Empty/None input passes straight through untouched.
    if not source:
        return source

    repaired, tidy_errors = tidy_fragment(source)
    if settings.DEBUG and tidy_errors:
        tidy_errors = filter_tidylib_errors(tidy_errors)
        if tidy_errors:
            log.debug('Tidylib errors:\n{}'.format(tidy_errors))
    return repaired
Ejemplo n.º 26
0
def fix_open_tags(source):
    """ Fixes missing tags in html fragments. """
    # Empty/None input passes straight through untouched.
    if not source:
        return source

    fixedhtml, errors = tidy_fragment(source)
    # Log tidy diagnostics only in DEBUG, and only those that survive
    # filter_tidylib_errors.
    if settings.DEBUG and errors:
        errors = filter_tidylib_errors(errors)
        if errors:
            log.debug('Tidylib errors:\n{}'.format(errors))
    return fixedhtml
Ejemplo n.º 27
0
def normalize(text):
    """ Normalize whitespace for a string of html using tidylib. """
    # Disable tidy's rewriting features so only whitespace/XHTML form change.
    tidy_options = {
        'drop_empty_paras': 0,
        'fix_backslash': 0,
        'fix_bad_comments': 0,
        'fix_uri': 0,
        'join_styles': 0,
        'lower_literals': 0,
        'merge_divs': 0,
        'output_xhtml': 1,
        'quote_ampersand': 0,
        'newline': 'LF',
    }
    normalized, _errors = tidylib.tidy_fragment(text, options=tidy_options)
    return normalized
Ejemplo n.º 28
0
        def POST(self):
            """ POST request

            edX external-grader endpoint: decode the xqueue payload, run the
            submission through the job manager, and return the edX-format
            JSON verdict. Every failure mode is caught deliberately and
            converted into a JSON error response rather than an HTTP error.
            """
            web.header('Content-Type', 'application/json')

            post_input = web.data()

            # Bare excepts below are intentional: any failure must surface
            # as a well-formed JSON grader message, never a traceback.
            try:
                decoded_input = json.loads(post_input)
            except:
                return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: cannot decode POST</p>"})

            if "xqueue_body" not in decoded_input:
                return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: no xqueue_body in POST</p>"})
            try:
                edx_input = json.loads(decoded_input["xqueue_body"])
                taskid = json.loads(edx_input["grader_payload"])["tid"]
            except:
                return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: cannot decode JSON</p>"})

            try:
                task = course.get_task(taskid)
            except:
                return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: unknown task {}</p>".format(taskid)})

            if not task.input_is_consistent(edx_input):
                return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: input not consistent with task</p>"})

            try:
                job_return = job_manager_sync.new_job(task, edx_input, "Plugin - EDX")
            except:
                return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: error while grading submission</p>"})

            try:
                # Assemble feedback text from the job result, then per-problem.
                text = ""
                if "text" in job_return:
                    text = job_return["text"]
                if "problems" in job_return:
                    for prob in job_return["problems"]:
                        text += "<br/><h4>" + job_return["task"].get_problems()[prob].get_name() + "</h4>" + job_return["problems"][prob]

                # Default score: 1 on success, 0 otherwise, unless the job
                # reports an explicit score.
                score = (1 if job_return["result"] == "success" else 0)
                if "score" in job_return:
                    score = job_return["score"]

                import tidylib

                # Tidy the feedback so edX always receives valid XHTML.
                out, dummy = tidylib.tidy_fragment(text, options={'output-xhtml': 1, 'enclose-block-text': 1, 'enclose-text': 1})
                return json.dumps({"correct": (True if (job_return["result"] == "success") else None), "score": score, "msg": out})
            except:
                return json.dumps({"correct": None, "score": 0, "msg": "<p>Internal grader error: error converting submission result</p>"})
Ejemplo n.º 29
0
def normalize(text):
    """ Normalize whitespace for a string of html using tidylib. """
    # All of tidy's rewriting features are switched off so that only
    # whitespace and XHTML form change; newlines are forced to LF.
    output, errors = tidylib.tidy_fragment(text, options={
                                    'drop_empty_paras':0,
                                    'fix_backslash':0,
                                    'fix_bad_comments':0,
                                    'fix_uri':0,
                                    'join_styles':0,
                                    'lower_literals':0,
                                    'merge_divs':0,
                                    'output_xhtml':1,
                                    'quote_ampersand':0,
                                    'newline':'LF'})
    return output
Ejemplo n.º 30
0
def cleanupText(text):
    """This method cleans up the text of the report using libtidy"""
    # tidylib options: XHTML output with XML declaration, indented, no tidy
    # meta marker, utf8, and &nbsp; left unquoted.
    options = dict(output_xhtml=1,
                    add_xml_decl=1,
                    indent=1,
                    tidy_mark=0,
                    char_encoding="utf8",
                    quote_nbsp=0)
    #remove html entities from the text
    ubody_text = unescape(text)
    #clean up xhtml using tidy
    aftertidy, errors = tidy_fragment(ubody_text.encode("utf8"), options, keep_doc=False)
    # NOTE(review): with pytidylib, tidy_fragment already returns a string,
    # so the str() below is likely a no-op -- confirm which binding is used.
    return str(aftertidy)
Ejemplo n.º 31
0
 def html2xhtml(html, **options):
     """Tidy *html* into XHTML; raise if tidy reports any problems."""
     options.update(doctype='omit', show_warnings=0, indent=0, output_xhtml=1)
     document, errors = tidy_fragment(html, options=options)
     if errors:
         raise Exception("Errors while processing %s\n==========\n%s" %
                         (html, errors))
     return document.strip()
Ejemplo n.º 32
0
 def html2xhtml(html, **options):
     """Convert *html* to stripped XHTML via tidy.

     Raises Exception when tidy reports any errors for the input.
     """
     options.update(doctype='omit')
     options.update(show_warnings=0)
     options.update(indent=0)
     # options.update(output_xml=1)
     options.update(output_xhtml=1)
     document, errors = tidy_fragment(html, options=options)
     if errors:
         #~ raise Exception(repr(errors))
         raise Exception("Errors while processing %s\n==========\n%s" %
                         (html, errors))
     # if document.startswith(WRAP_BEFORE):
     #     document = document[len(WRAP_BEFORE):]
     #     document = document[:-15]
     return document.strip()
Ejemplo n.º 33
0
def mytidy(content):
    """Tidy *content* with a fixed, strict, utf-8, indenting option set."""
    tidy_options = {
        "output-xhtml": 0,     # XHTML instead of HTML4
        "indent": 1,           # Pretty; not too much of a performance hit
        "indent-spaces": 4,
        "tab-size": 4,
        "tidy-mark": 0,        # No tidy meta tag in output
        "wrap": 0,             # No wrapping
        "alt-text": "",        # Help ensure validation
        "doctype": 'strict',   # Little sense in transitional for tool-generated markup...
        "force-output": 1,     # May not get what you expect but you will get something
        "char-encoding": 'utf8',
        "input-encoding": 'utf8',
        "output-encoding": 'utf8',
    }
    document, _errors = tidy_fragment(content, tidy_options)
    return document
Ejemplo n.º 34
0
def tidy(soup):
    """Run *soup* through HTMLTidy, then massage the resulting whitespace."""
    document, messages = tidy_fragment(str(soup),
                                       options={
                                           'indent': 'auto',
                                           'logical-emphasis': 'yes',
                                           'vertical-space': 'yes',
                                           'fix-uri': 'no'
                                       })

    # Tidy inserts too much vertical space; collapse triple newlines and
    # pull <ul> elements back up against the preceding line.
    html = document.replace('\n\n\n', '\n\n')
    html = re.sub(r'\n\n(\s*)<ul>', r'\n\1<ul>', html)

    # surface Tidy's diagnostics
    print(messages)
    return html
Ejemplo n.º 35
0
    def __init__(self, op_html):
        """
        Intializes this option with HTML. The HTML is validated before initializing the option.
        The input HTML should be a snippet and not contain the `html`, `head`, `title`, nor `body` tags.
        Throws an HTMLValidationException if the validation produces errors.

        NOTE(review): Python 2 syntax (`print errors` statement).

        :param op_html: The string representation of the option HTML.
        :return:
        """

        # Wrap the snippet in a full document so tidy validates it in context.
        document, errors = tidy_fragment("<!DOCTYPE html><html><head><title></title><body>%s</body></html>" % op_html)
        # NOTE(review): threshold is > 1, not > 0 -- presumably one
        # diagnostic line is always present for the wrapper; confirm.
        if len(errors) > 1:
            print errors
            raise HTMLValidationException()
        else:
            Option.__init__(self, op_html)
Ejemplo n.º 36
0
def normalize(text):
    """Normalize whitespace for a string of html using tidylib.

    Returns the tidied XHTML; tidy's error output is discarded.
    Note: pytidylib converts underscores in option names to hyphens.
    """
    tidy_options = {
        "drop_empty_paras": 0,
        "fix_backslash": 0,
        "fix_bad_comments": 0,
        "fix_uri": 0,
        "join_styles": 0,
        "lower_literals": 0,
        "merge_divs": 0,
        "output_xhtml": 1,
        "quote_ampersand": 0,
        "newline": "LF",
    }
    output, _errors = tidylib.tidy_fragment(text, options=tidy_options)
    return output
Ejemplo n.º 37
0
    def on_message(message):
        """Persist an incoming channel message as a Message row."""
        try:
            # pop() raising IndexError means there is no channel id,
            # i.e. a private message.
            channel_id = message.channel_id.pop()
        except IndexError:
            # Private messages are not logged for now.
            return

        sender = mumble_client.users[message.actor]
        channel = mumble_client.channels[channel_id]

        # Run the raw text through tidy so the stored HTML is well-formed.
        linted_text, _ = tidy_fragment(message.message)

        record = Message(user_name=sender['name'],
                         channel_name=channel['name'],
                         message=linted_text,
                         timestamp=datetime.datetime.now())

        session.add(record)
        session.commit()
Ejemplo n.º 38
0
 def process_single_node(node, context, typ, src):
     """Fill in a single report-template node in place, according to typ.

     :param node: element to populate (mutated in place, then cleaned).
     :param typ: node type -- "text", "html", "link" or "src".
     :param src: source expression evaluated against context (may be
         falsy for "link" nodes, which then fall back to the context URL).
     """
     if typ == "text":
         # Plain text: evaluate the source expression and set as node text.
         node.text = value_repr(get_element_value(context, src))
     elif typ == "html":
         raw_value = get_element_value(context, src, "")
         if raw_value:
             # Tidy the raw markup, wrap in a <div>, and parse it back into
             # an element; the node's own attributes are copied onto it.
             html_element = html.fragments_fromstring(
                 "<div>%s</div>" % tidy_fragment(raw_value)[0])[0]
             for (key, value) in node.attrib.iteritems():
                 html_element.attrib[key] = value
             # NOTE(review): the element is both added as a sibling and
             # inserted as first child -- looks intentional but confirm.
             node.addnext(html_element)
             node.insert(0, html_element)
     # !+ANCHOR(mr, sep-2014) this should be type "anchor" not "link" !!
     # There is another HTML element "link" that is something else altogether...
     # e.g. <link rel="stylesheet" type="text/css" href="/browserref.css">
     elif typ == "link":
         src_url = get_attr(node, "url")
         if src_url:
             link_url = get_element_value(context, src_url)
         else:
             # No explicit url attribute: link to the context object itself.
             link_url = url.absoluteURL(context, request)
         node.attrib["href"] = link_url
         if src:
             node.text = get_element_value(context, src)
     # For outputting elements that have an @src attribute to an external
     # resource, such as <img>, <script>, ... resolves @src to the
     # designated resource base url, as per configuration. Any additional
     # attrs needed to be output are specified verbatim in the template.
     elif typ == "src":
         src_url = get_attr(node, "src")
         assert src_url is not None, \
             "Node %s attribute %r is invalid. Check report template." % (
                 node, typ)
         parsed_url = urlparse.urlparse(src_url)
         if not parsed_url.path.startswith("/"):
             # Relative path: resolve against the static resource base.
             node.attrib["src"] = urlparse.urljoin(
                 "/@@/reporting-static/", src_url)
         else:
             # absolute or external, pass on as is
             node.attrib["src"] = src_url
     clean_element(node)
Ejemplo n.º 39
0
 def process_single_node(node, context, typ, src):
     """Populate one report-template node in place, dispatching on typ.

     Supported node types are "text", "html", "link" and "src"; the node
     is cleaned via clean_element() once its content has been filled in.
     """
     if typ == "text":
         node.text = value_repr(get_element_value(context, src))
     elif typ == "html":
         markup = get_element_value(context, src, "")
         if markup:
             # Tidy the raw markup, wrap it in a <div> and parse it back.
             tidied = tidy_fragment(markup)[0]
             embedded = html.fragments_fromstring("<div>%s</div>" % tidied)[0]
             # Carry the template node's own attributes over.
             for (attr_name, attr_value) in node.attrib.iteritems():
                 embedded.attrib[attr_name] = attr_value
             node.addnext(embedded)
             node.insert(0, embedded)
     # !+ANCHOR(mr, sep-2014) this should be type "anchor" not "link" !!
     # (HTML already has a <link> element with different semantics, e.g.
     # <link rel="stylesheet" type="text/css" href="/browserref.css">)
     elif typ == "link":
         url_attr = get_attr(node, "url")
         if url_attr:
             target = get_element_value(context, url_attr)
         else:
             # No explicit url attribute: link to the context object itself.
             target = url.absoluteURL(context, request)
         node.attrib["href"] = target
         if src:
             node.text = get_element_value(context, src)
     # Elements carrying an @src attribute (<img>, <script>, ...) get their
     # @src resolved against the configured resource base url; any other
     # attributes come verbatim from the template.
     elif typ == "src":
         resource = get_attr(node, "src")
         assert resource is not None, \
             "Node %s attribute %r is invalid. Check report template." % (
                 node, typ)
         if not urlparse.urlparse(resource).path.startswith("/"):
             # Relative path: resolve against the static resource base.
             node.attrib["src"] = urlparse.urljoin(
                 "/@@/reporting-static/", resource)
         else:
             # Absolute or external url: pass through untouched.
             node.attrib["src"] = resource
     clean_element(node)
Ejemplo n.º 40
0
 def process_single_node(node, context, typ, src):
     """Render a single report-template node in place, according to typ.

     :param node: element to fill in (cleaned first, then mutated in place).
     :param context: object the source expressions are evaluated against.
     :param typ: node type -- "text", "html" or "link".
     :param src: source expression for the node's value (may be falsy).
     """
     clean_element(node)
     if typ == "text":
         node.text = value_repr(get_element_value(context, src))
     elif typ == "html":
         raw_value = get_element_value(context, src, "")
         if raw_value:
             # Tidy the raw markup, wrap it in a <div> and parse it back.
             html_element = html.fragments_fromstring("<div>%s</div>" % tidy_fragment(raw_value)[0])[0]
             for (key, value) in node.attrib.iteritems():
                 html_element.attrib[key] = value
             node.addnext(html_element)
             node.insert(0, html_element)
     # BUG FIX: was `elif type == "link":` -- comparing the *builtin* `type`
     # to a string, which is always False, so the link branch never ran.
     # Sibling versions of this function correctly compare `typ`.
     elif typ == "link":
         url_src = get_attr(node, "url")
         if url_src:
             link_url = get_element_value(context, url_src)
         else:
             # No explicit url attribute: link to the context object itself.
             link_url = url.absoluteURL(context, request)
         node.attrib["href"] = link_url
         if src:
             node.text = get_element_value(context, src)
Ejemplo n.º 41
0
    def clean(self):
        """Form-wide validation: tidy the HTML fields, validate the product
        code, and refuse to save while a 1C warehouse import is running."""
        cleaned_data = super().clean()

        # Run each HTML field through tidy; on failure attach a field error.
        html_fields = ['shortdescr', 'yandexdescr', 'descr', 'spec', 'manuals',
                       'state', 'complect', 'stitches', 'dealertxt',
                       'sm_display', 'sm_software']
        for field in html_fields:
            value = cleaned_data.get(field)
            if not value:
                continue
            fragment, errors = tidy_fragment(value, options={'indent': 0})
            if fragment:
                cleaned_data[field] = fragment
            else:
                self.add_error(field, forms.ValidationError("Ошибка очистки HTML"))

        # Code may be absent during mass edit, hence the presence check.
        code = cleaned_data.get('code')
        code_pattern = re.compile(r'[-\.\w]+')
        if code and not code_pattern.fullmatch(code):
            self.add_error('code', forms.ValidationError("Код товара содержит недопустимые символы"))

        # The cache key acts as an import lock: block saving mid-import.
        if cache.get("celery-single-instance-import1c") is not None:
            self.add_error(None, forms.ValidationError("Сохранение невозможно во время импорта склада, попробуйте позже."))
        return cleaned_data
Ejemplo n.º 42
0
 def process_single_node(node, context, typ, src):
     """Render a single report-template node in place, according to typ.

     :param node: element to fill in (cleaned first, then mutated in place).
     :param context: object the source expressions are evaluated against.
     :param typ: node type -- "text", "html" or "link".
     :param src: source expression for the node's value (may be falsy).
     """
     clean_element(node)
     if typ == "text":
         node.text = get_element_value(context, src)
     elif typ == "html":
         raw_value = get_element_value(context, src, "")
         if raw_value:
             # Tidy the raw markup, wrap it in a <div> and parse it back.
             html_element = etree.fromstring(
                 "<div>%s</div>" % tidy_fragment(raw_value)[0])
             for (key, value) in node.attrib.iteritems():
                 html_element.attrib[key] = value
             node.addnext(html_element)
             node.insert(0, html_element)
     # BUG FIX: was `elif type == "link":` -- comparing the *builtin* `type`
     # to a string, which is always False, so the link branch never ran.
     # Sibling versions of this function correctly compare `typ`.
     elif typ == "link":
         url_src = get_attr(node, "url")
         if url_src:
             link_url = get_element_value(context, url_src)
         else:
             # No explicit url attribute: link to the context object itself.
             link_url = url.absoluteURL(context, request)
         node.attrib["href"] = link_url
         if src:
             node.text = get_element_value(context, src)
Ejemplo n.º 43
0
def cmd_tidy(root, **kwargs):
    """Serialize *root*, clean it with HTML Tidy, and parse the result back.

    Any keyword arguments override the default Tidy options below.
    Returns the root element of the tidied tree.
    """
    # Baseline Tidy configuration; deliberately conservative (no structural
    # rewriting, comments hidden, tidy meta mark suppressed).
    options = {
        'clean': 0,
        'drop-empty-elements': 0,
        'drop-empty-paras': 0,
        'drop-proprietary-attributes': 1,
        'logical-emphasis': 0,
        'merge-divs': 0,
        'merge-spans': 0,
        'anchor-as-name': 1,
        'coerce-endtags': 1,
        'custom-tags': 'blocklevel',
        'enclose-block-text': 0,
        'enclose-text': 0,
        'escape-scripts': 1,
        'fix-backslash': 1,
        'fix-style-tags': 1,
        'fix-uri': 1,
        'literal-attributes': 0,
        'uppercase-attributes': 0,
        'uppercase-tags': 0,
        'hide-comments': 1,
        'join-classes': 1,
        'join-styles': 1,
        'merge-emphasis': 0,
        'replace-color': 0,
        'break-before-br': 0,
        'indent': 0,
        'indent-attributes': 0,
        'keep-tabs': 0,
        'omit-optional-tags': 0,
        'tidy-mark': 0,
        'vertical-space': 0
    }
    options.update(kwargs)
    cleaned_html, _errors = tidy_fragment(tostring(root), options=options)
    return fromstring(cleaned_html)
Ejemplo n.º 44
0
    def handle(self, *args, **options):
        """Run HTML Tidy over the HTML fields of every product and save any
        product whose markup actually changed.

        For fields tidy could not process, prints "<product>: <field>"; at
        the end prints the number of products updated.
        """
        num = 0
        for product in Product.objects.all():
            changed = False
            for field in [
                    'shortdescr', 'yandexdescr', 'descr', 'spec', 'state',
                    'complect', 'stitches', 'sm_display', 'sm_software'
            ]:
                value = product.__dict__[field]
                if not value:
                    continue
                fragment, errors = tidy_fragment(value, options={'indent': 0})
                if not fragment:
                    # Tidy produced no output; report and leave the field.
                    self.stdout.write('{}: {}'.format(str(product), field))
                    continue
                # Direct string comparison: previously both values were md5
                # hashed just to test equality, which was redundant work and
                # (theoretically) collision-unsafe.
                if value != fragment:
                    product.__dict__[field] = fragment
                    changed = True
            if changed:
                product.save()
                num += 1

        self.stdout.write('Successfully updated %d products' % num)
Ejemplo n.º 45
0
def tidy(fragment):
    """Return the tidied version of an HTML fragment, discarding errors."""
    cleaned, _errors = tidy_fragment(fragment)
    return cleaned
Ejemplo n.º 46
0
def render_get_caught_up():
    '''
    Render the prose for the get-caught-up info box
    The Google Sheet that powers this will be regularly re-downloaded

    Writes 'get-caught-up.json' only when the sheet is marked published AND
    all markup fields validate; always writes 'get-caught-up-debug.json' so
    editors can gauge content length (or see the markup errors).
    '''
    copy = copytext.Copy(app_config.CALENDAR_PATH)
    sheet = copy['get_caught_up']
    serialized_data = json.loads(sheet.json())

    is_valid = True
    markup_fields = [
        'intro_1', 'intro_2', 'bullet_1', 'bullet_2', 'bullet_3', 'bullet_4',
        'bullet_5'
    ]
    markup_errors_found = None
    # Note that despite its name, tidy_fragment() requires a valid html document or else
    # it will throw markup validation errors. The documentation at http://countergram.github.io/pytidylib/
    # did not address this seeming discrepancy.
    for field in markup_fields:
        document, errors = tidy_fragment(
            '<!DOCTYPE html><html><head><title>test</title></head><body>%s</body></html>'
            % serialized_data[field])
        if errors:
            is_valid = False
            markup_errors_found = errors
            break

    # Build the payload once. Previously meta/content were constructed twice
    # verbatim, with two separate utcnow() calls that could stamp the main
    # and debug files with slightly different timestamps.
    meta = {
        'is_valid_markup': is_valid,
        'published': serialized_data['published'],
        'last_updated': datetime.utcnow()
    }
    if is_valid:
        content = {
            k: v.strip()
            for k, v in serialized_data.items() if k in markup_fields
        }
    else:
        content = "The HTML markup is invalid. Errors:\n{}".format(
            markup_errors_found)

    # Don't publish if that option is off, or if a syntax error is found
    if serialized_data.get('published', '').lower() == 'yes' and is_valid:
        _write_json_file({
            'meta': meta,
            'content': content
        }, 'get-caught-up.json')

    # Publish a debug version to help editors gauge length of content
    # If there are no markup errors and `published` is `True`, the contents
    # of this file will be identical to that of the main GCU file
    _write_json_file({
        'meta': meta,
        'content': content
    }, 'get-caught-up-debug.json')
Ejemplo n.º 47
0
      subject = subject + " @" + notebook

   # Build an Evernote ENML <note> document for the feed entry.
   # NOTE(review): enclosing function header is outside this chunk; `s`,
   # `subject`, `notebook`, `char_encoding`, `published_datetime` and
   # `updated_datetime` are defined earlier in it.
   msg_body = ""
   msg_body = msg_body + '<note><title>'+subject+'</title><content><![CDATA[<?xml version="1.0" encoding="'+char_encoding+'" standalone="no"?> <!DOCTYPE en-note SYSTEM "http://xml.evernote.com/pub/enml2.dtd"> <en-note>'
   msg_url = ""

   # Prefer the 'alternate' link over 'canonical' when both are present
   # (the second assignment overwrites the first).
   if 'canonical' in s.keys():
      d = s["canonical"][0]
      msg_url = d["href"].encode(char_encoding, 'replace')
   if 'alternate' in s.keys():
      d = s["alternate"][0]
      msg_url = d["href"].encode(char_encoding, 'replace')
   # Append the tidied entry body; both 'summary' and 'content' are
   # appended when both exist.
   if 'summary' in s.keys():
      d = s["summary"]
      dirtyHtml = d["content"]
      cleanHtml, errors = tidy_fragment(dirtyHtml)
      msg_body = msg_body + cleanHtml.encode(char_encoding, 'replace')
   if 'content' in s.keys():
      d = s["content"]
      dirtyHtml = d["content"]
      cleanHtml, errors = tidy_fragment(dirtyHtml)
      msg_body = msg_body + cleanHtml.encode(char_encoding, 'replace')
   msg_body = msg_body + "</en-note>]]>\r\n</content>\r\n"
   # Optional created/updated timestamps.
   if published_datetime:
      msg_body = msg_body + "<created>" + published_datetime + "</created>"
   if updated_datetime:
      msg_body = msg_body + "<updated>" + updated_datetime + "</updated>"
   msg_body = msg_body + "<note-attributes><source>web.clip</source><source-url>" + escape(msg_url) + "</source-url></note-attributes>"
   msg_body = msg_body + "</note>\r\n"

   print(msg_body)
Ejemplo n.º 48
0
def object_for_typepad_object(tp_obj):
    """Get or create the local Object mirroring a TypePad asset.

    Returns a ``(really_a_share, obj)`` tuple where ``really_a_share``
    is True when the asset turned out to be a boilerplate-only reblog.
    NOTE(review): the visible chunk ends inside the reblog_of_url branch
    with no trailing return -- the function appears truncated here.
    """
    # Reuse an existing mirrored object when one exists.
    try:
        obj = Object.objects.get(service='typepad.com',
                                 foreign_id=tp_obj.url_id)
    except Object.DoesNotExist:
        pass
    else:
        log.debug("Reusing typepad object %r for asset %s", obj, tp_obj.url_id)
        return False, obj

    log.debug("Making new object for TypePad post %s by %s", tp_obj.url_id,
              tp_obj.author.display_name)

    author = account_for_typepad_user(tp_obj.author)
    body = tp_obj.rendered_content
    if not body and tp_obj.content:
        if tp_obj.text_format == 'html_convert_linebreaks':
            # Plain-text content: wrap paragraphs in <p> tags.
            body = '\n\n'.join(u'<p>%s</p>' % t
                               for t in tp_obj.content.split('\n\n'))
        else:
            body = tp_obj.content
    if body:
        # Run the body through tidy so stored HTML is well-formed.
        body, errors = tidy_fragment(body)
    else:
        body = ''

    obj = Object(
        service='typepad.com',
        foreign_id=tp_obj.url_id,
        render_mode='mixed',
        title=tp_obj.title,
        body=body,
        time=tp_obj.published,
        permalink_url=tp_obj.permalink_url,
        author=author,
    )

    if getattr(tp_obj, 'in_reply_to', None) is not None:
        # This post is in reply, so we don't care if our referent was
        # really a share. Be transitively in reply to the shared obj.
        really_a_share, obj.in_reply_to = object_for_typepad_object(
            tp_obj.in_reply_to)
    elif getattr(tp_obj, 'reblog_of', None) is not None:
        # Assets are public so it's okay if we use an anonymous typd here.
        t = typd.TypePad(endpoint='http://api.typepad.com/')
        reblog_of = t.assets.get(tp_obj.reblog_of.url_id)

        really_a_share, obj.in_reply_to = object_for_typepad_object(reblog_of)
        remove_reblog_boilerplate_from_obj(obj)
        if not obj.body:
            # Nothing left after stripping boilerplate: it's a pure share.
            return True, obj.in_reply_to
    elif getattr(tp_obj, 'reblog_of_url', None) is not None:
        # Reblog referenced only by URL: try to resolve it best-effort.
        reblog_url = tp_obj.reblog_of_url
        try:
            in_reply_to = leapfrog.poll.embedlam.object_for_url(reblog_url)
        except leapfrog.poll.embedlam.RequestError, exc:
            in_reply_to = None
        except ValueError, exc:
            in_reply_to = None
            log.error("Error making object from referent %s of %s's post %s",
                      reblog_url, author.display_name, tp_obj.url_id)
            log.exception(exc)
Ejemplo n.º 49
0
def check_html(text):
    """Return 1 when tidy reports no missing close tags in text, else 0."""
    document, err = tidy_fragment(text, options={'numeric-entities': 1})
    for line in err.split("\n"):
        if re.search("missing </", line):
            return 0
    return 1
Ejemplo n.º 50
0
 def html(cls, string, show_everything=False):
     """Run the string through tidylib and return the tidied fragment."""
     cleaned, _errors = tidylib.tidy_fragment(string)
     return cleaned
Ejemplo n.º 51
0
 def test_frag_with_incomplete_img_tag(self):
     """An <img> without alt should gain an empty alt attribute."""
     fragment = "<img src='foo'>"
     doc, err = tidy_fragment(fragment)
     self.assertEqual(doc, '''<img src='foo' alt="" />''')
Ejemplo n.º 52
0
def tidy_html(html):
    """Tidy an HTML string with indentation disabled and return it."""
    cleaned, _errors = tidylib.tidy_fragment(html, options={'indent': 0})
    return cleaned
Ejemplo n.º 53
0
 def test_frag_with_unicode(self):
     """Unicode input should pass through tidy unchanged."""
     text = "unicode string ß"
     doc, err = tidy_fragment(text)
     self.assertEqual(doc, text)
Ejemplo n.º 54
0
 def test_tidy_fragment(self):
     """Repeated tidy_fragment calls must not leave anything in sink.sinks."""
     fragment = "<p>hello"
     for _ in xrange(100):
         doc, err = tidy_fragment(fragment)
     self.assertEqual(sink.sinks, {})
Ejemplo n.º 55
0
 def html(cls, string, show_everything=False):
     """Parse HTML by tidying it; returns the cleaned markup."""
     document, _errors = tidylib.tidy_fragment(string)
     return document
Ejemplo n.º 56
0
 def html(cls, string):
     """Parse HTML by tidying it; returns the cleaned markup."""
     document, _errors = tidylib.tidy_fragment(string)
     return document
Ejemplo n.º 57
0
 def test_tidy_fragment(self):
     """Repeated tidy_fragment calls must not leave anything in sink.sinks."""
     fragment = "<p>hello"
     for _ in range(100):
         doc, err = tidy_fragment(fragment)
     self.assertEqual(sink.sinks, {})
Ejemplo n.º 58
0
import glob
import sys
from tidylib import tidy_document, tidy_fragment

# Tidy configuration applied to every processed file.
options = {
    "indent": "auto",
    "indent-spaces": 4,
    "markup": True,
    "output-xml": False,
    "input-xml": False,
    "show-warnings": True,
    "numeric-entities": True,
    "quote-marks": True,
    "quote-nbsp": True,
    "quote-ampersand": False,
    "break-before-br": False,
    "uppercase-tags": False,
    "uppercase-attributes": False,
}

# Optional first CLI argument selects the file extension (default: html).
# Catch IndexError specifically instead of a bare except, which would also
# have swallowed KeyboardInterrupt and the like.
try:
    file_type = sys.argv[1]
except IndexError:
    file_type = "html"

# Tidy each matching file in place. Both handles are managed with `with`
# so the rewritten file is always flushed and closed (the original opened
# the output with a bare open() and never closed it).
for path in glob.glob("*." + file_type):
    with open(path) as infile:
        cleaned, errors = tidy_fragment(infile.read(), options)
    with open(path, "w") as outfile:
        outfile.write(cleaned)
Ejemplo n.º 59
0
 def test_frag_with_unicode(self):
     """Unicode input should pass through tidy unchanged."""
     text = u"unicode string ß"
     doc, err = tidy_fragment(text)
     self.assertEqual(doc, text)
Ejemplo n.º 60
0
 def clean_html_fragment(self, body):
     """Return body tidied as strict XHTML; tidy errors are discarded."""
     tidy_options = {"output-xhtml": 1,
                     "doctype": 'strict'}
     content, _errors = tidy_fragment(body, options=tidy_options)
     return content