Beispiel #1
0
    def test_data_uri(self):
        """The editable sanitizer drops ``data:`` URIs from ``<img src>``."""
        # An embedded PNG: image data URIs are stripped out too.
        png_markup = '<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==">'
        # A non-image payload is expected to be stripped the same way.
        pdf_markup = '<img src="data:application/pdf;base64,blergh">'

        for markup in (png_markup, pdf_markup):
            cleaned = sanitizer.sanitize_html_to_editable(markup)
            self.assertHTMLEqual(cleaned, "<img/>")
Beispiel #2
0
    def test_data_uri(self):
        """Check that every ``data:`` URI is removed from an image's ``src``.

        Both an image payload and a non-image payload must leave a bare
        ``<img/>`` behind after sanitizing for the editor.
        """
        bare_image = "<img/>"
        cases = [
            # Strip out all data URIs, even genuine images.
            ('image', '<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==">'),
            # Strip out non-image data URIs as well.
            ('non-image', '<img src="data:application/pdf;base64,blergh">'),
        ]

        for _label, html in cases:
            self.assertHTMLEqual(
                sanitizer.sanitize_html_to_editable(html), bare_image)
Beispiel #3
0
    def test_clean(self):
        """Sanitizing for the editor removes unsafe markup and keeps content.

        The expected output shows the contract being tested: ``<script>`` and
        ``<style>`` disappear, the ``class`` attribute is dropped, the
        ``javascript:`` href is removed, links gain ``target``/``rel``
        attributes, ``&rdquo;`` becomes the literal character, and the
        ``<section>`` wrapper is removed while its heading survives.
        """
        raw_markup = """
            <script>unsafe</script>
            <style>html {background-color: pink !important;}</style>
            <h1 class='obtrusive'>Something</h1>
            <h2>OK</h2>
            &amp;
            &rdquo;
            <a href='javascript:alert("Oh no")'>This stuff</a>
            <a href='http://google.com'>That guy</a>
            <section>
              <h3>This should show up</h3>
            </section>
        """
        expected_markup = u"""
            <h1>Something</h1>
            <h2>OK</h2>
            &amp;
            \u201d
            <a target="_blank" rel="nofollow">This stuff</a>
            <a href="http://google.com" target="_blank" rel="nofollow">That guy</a>
            <h3>This should show up</h3>
        """

        cleaned = sanitizer.sanitize_html_to_editable(raw_markup)
        self.assertHTMLEqual(cleaned, expected_markup)
Beispiel #4
0
    def save(self, *args, **kwargs):
        """Render and sanitize the HTML, then save through the parent class.

        When markdown text is present but no HTML has been set yet, the HTML
        is generated from the markdown. The HTML is then run through the
        sanitizer that matches the note: the editable variant when
        ``self.note.is_editable()`` is true, the formatting-preserving variant
        otherwise. Positional and keyword arguments are forwarded unchanged.
        """
        needs_render = self.markdown and not self.html
        if needs_render:
            self.html = markdown.markdown(self.markdown)

        # Choose the cleaner first, then apply it in one place.
        clean = (sanitizer.sanitize_html_to_editable
                 if self.note.is_editable()
                 else sanitizer.sanitize_html_preserve_formatting)
        self.html = clean(self.html)

        super(NoteMarkdown, self).save(*args, **kwargs)
Beispiel #5
0
    def save(self, *args, **kwargs):
        """Fill in and clean ``self.html`` before delegating to the parent save.

        HTML is rendered from ``self.markdown`` only if markdown exists and
        ``self.html`` is still empty. Editable notes get the editable
        sanitizer; all other notes get the formatting-preserving one.
        """
        if not self.html and self.markdown:
            self.html = markdown.markdown(self.markdown)

        rendered = self.html
        if not self.note.is_editable():
            # Non-editable notes keep their original formatting.
            rendered = sanitizer.sanitize_html_preserve_formatting(rendered)
        else:
            rendered = sanitizer.sanitize_html_to_editable(rendered)
        self.html = rendered

        super(NoteMarkdown, self).save(*args, **kwargs)
Beispiel #6
0
    def test_clean(self):
        """Verify the editable sanitizer's output for a mixed, unsafe snippet.

        Unsafe elements (``<script>``, ``<style>``), the ``class`` attribute
        and the ``javascript:`` link target must be gone; anchors must carry
        ``target="_blank" rel="nofollow"``; the heading inside ``<section>``
        must remain without its wrapper.
        """
        unsafe_html = """
            <script>unsafe</script>
            <style>html {background-color: pink !important;}</style>
            <h1 class='obtrusive'>Something</h1>
            <h2>OK</h2>
            &amp;
            &rdquo;
            <a href='javascript:alert("Oh no")'>This stuff</a>
            <a href='http://google.com'>That guy</a>
            <section>
              <h3>This should show up</h3>
            </section>
        """
        sanitized = sanitizer.sanitize_html_to_editable(unsafe_html)

        wanted = u"""
            <h1>Something</h1>
            <h2>OK</h2>
            &amp;
            \u201d
            <a target="_blank" rel="nofollow">This stuff</a>
            <a href="http://google.com" target="_blank" rel="nofollow">That guy</a>
            <h3>This should show up</h3>
        """
        self.assertHTMLEqual(sanitized, wanted)
    def forwards(self, orm):
        """Copy each note's rendered HTML from S3 into its NoteMarkdown row.

        Every ``notes.Note`` matched by ``notemarkdown__html__isnull=True`` has
        ``html/<slug>.html`` read from the default storage bucket. Embedded
        data URIs are passed through ``sanitizer.data_uris_to_s3``, the markup
        is sanitized (editable variant for LECTURE_NOTES, formatting-preserving
        variant otherwise) and the result is saved on the note's
        ``NoteMarkdown``.

        Setting the ``NOTE_LIMIT_0021`` environment variable to an integer
        stops the run after that many notes, counting both migrated and
        failed ones, so the migration can be applied in chunks.

        Progress is written to stdout: ``:`` before each download, ``(`` for
        a note with no HTML on S3, ``]`` for an editable note and ``)`` for a
        non-editable one. A summary and the failed slugs are printed last.
        """
        # Note: Don't use "from appname.models import ModelName".
        # Use orm.ModelName to refer to models in this application,
        # and orm['appname.ModelName'] for models in other applications.

        # keep score. save slugs as lists for debugging purposes if needed.
        good = []
        edit = []
        nonedit = []
        bad = []

        # at the time of migration, editable categories are limited to
        EDITABLE_CATEGORIES = ('LECTURE_NOTES',)

        necessary_notes = orm['notes.Note'].objects.filter(
            notemarkdown__html__isnull=True)
        n_notes = necessary_notes.count()

        # perform migration in discrete chunks to deal with the transaction
        # (just delete the migration from the south table and run again)
        limitkey = 'NOTE_LIMIT_0021'
        sys.stdout.write('Running until ')
        if limitkey in os.environ:
            max_notes = int(os.environ[limitkey])
        else:
            max_notes = n_notes
        display_counts(max_notes, n_notes)

        # visualization to show how well this is moving through a large
        # database.
        counter = 0
        display_counts(counter, max_notes)
        # find each Note without an html field, download its S3 html, and
        # store it in the local database.
        for note in necessary_notes:
            # download the s3 content
            html = ''
            # copy/pasted from model code for Note.get_relative_s3_path
            note_s3_path = 'html/{0}.html'.format(note.slug)
            sys.stdout.write(':')
            sys.stdout.flush()
            key = default_storage.bucket.get_key(note_s3_path)
            if key:
                html = key.read()

            if html:
                good.append(note.slug)

                # clean the html in a consistent way with note uploads as of
                # the time of this migration.
                # handle embedded images from pdf2htmlEX or other sources
                html = sanitizer.data_uris_to_s3(html)
                if note.category in EDITABLE_CATEGORIES:
                    # make HTML editable
                    html = sanitizer.sanitize_html_to_editable(html)
                    sys.stdout.write(']')
                    edit.append(note.slug)
                else:
                    # clean up HTML without concern for editing
                    html = sanitizer.sanitize_html_preserve_formatting(html)
                    sys.stdout.write(')')
                    nonedit.append(note.slug)

                # store the html in the corresponding NoteMarkdown object
                nmd = orm['notes.NoteMarkdown'].objects.get_or_create(
                    note=note)[0]
                nmd.html = html
                nmd.save()
                sys.stdout.write(' ')
            else:
                # nothing usable on S3 for this note
                sys.stdout.write('( ')
                bad.append(note.slug)

            # Shared bookkeeping for migrated AND failed notes. The failed
            # path used to `continue` past these checks, so a failure landing
            # on the limit pushed the counter beyond max_notes and the chunk
            # limit was never applied.
            counter = counter + 1
            # track 20 notes per line
            if counter % 20 == 0:
                # finish off previous line and start new line
                display_counts(counter, max_notes)
                # flush per line, just in case it isn't outputting
                sys.stdout.flush()

            # perform migration in discrete chunks to deal with the transaction
            if counter == max_notes:
                break

        # Display the score. Single-argument print(...) emits the same text
        # under the Python 2 print statement and the Python 3 function.
        print("Migrated {0} notes and failed to migrate {1} notes.".format(
            len(good), len(bad)))
        print("Of good notes, {0} are editable and {1} are not.".format(
            len(edit), len(nonedit)))

        print("Failed list:")
        for slug in bad:
            print(slug)
Beispiel #8
0
    def forwards(self, orm):
        """Pull note HTML from S3 into the local ``NoteMarkdown`` table.

        Iterates over ``notes.Note`` rows selected with
        ``notemarkdown__html__isnull=True``. For each note it reads
        ``html/<slug>.html`` from the default storage bucket, rewrites
        embedded data URIs via ``sanitizer.data_uris_to_s3``, sanitizes the
        markup (editable for the LECTURE_NOTES category, formatting-preserving
        for everything else) and saves it on the note's ``NoteMarkdown``.

        If the ``NOTE_LIMIT_0021`` environment variable holds an integer, the
        run stops once that many notes have been handled, whether they
        succeeded or failed. This allows the migration to be run in chunks.

        Progress markers on stdout: ``:`` per download attempt, ``(`` when no
        HTML was found, ``]`` for an editable note, ``)`` for a non-editable
        note. A summary plus the slugs that failed are printed at the end.
        """
        # Note: Don't use "from appname.models import ModelName".
        # Use orm.ModelName to refer to models in this application,
        # and orm['appname.ModelName'] for models in other applications.

        # keep score. save slugs as lists for debugging purposes if needed.
        good = []
        edit = []
        nonedit = []
        bad = []

        # at the time of migration, editable categories are limited to
        EDITABLE_CATEGORIES = ('LECTURE_NOTES', )

        necessary_notes = orm['notes.Note'].objects.filter(
            notemarkdown__html__isnull=True)
        n_notes = necessary_notes.count()

        # perform migration in discrete chunks to deal with the transaction
        # (just delete the migration from the south table and run again)
        limitkey = 'NOTE_LIMIT_0021'
        sys.stdout.write('Running until ')
        # `in` replaces the deprecated, Python-2-only dict.has_key().
        if limitkey in os.environ:
            max_notes = int(os.environ[limitkey])
        else:
            max_notes = n_notes
        display_counts(max_notes, n_notes)

        # visualization to show how well this is moving through a large
        # database.
        counter = 0
        display_counts(counter, max_notes)
        # find each Note without an html field, download its S3 html, and
        # store it in the local database.
        for note in necessary_notes:
            # download the s3 content
            html = ''
            # copy/pasted from model code for Note.get_relative_s3_path
            note_s3_path = 'html/{0}.html'.format(note.slug)
            sys.stdout.write(':')
            sys.stdout.flush()
            key = default_storage.bucket.get_key(note_s3_path)
            if key:
                html = key.read()

            # check the downloaded html
            if not html:
                sys.stdout.write('( ')
                bad.append(note.slug)
            else:
                good.append(note.slug)

                # clean the html in a consistent way with note uploads as of
                # the time of this migration.
                # handle embedded images from pdf2htmlEX or other sources
                html = sanitizer.data_uris_to_s3(html)
                if note.category in EDITABLE_CATEGORIES:
                    # make HTML editable
                    html = sanitizer.sanitize_html_to_editable(html)
                    sys.stdout.write(']')
                    edit.append(note.slug)
                else:
                    # clean up HTML without concern for editing
                    html = sanitizer.sanitize_html_preserve_formatting(html)
                    sys.stdout.write(')')
                    nonedit.append(note.slug)

                # store the html in the corresponding NoteMarkdown object
                nmd = orm['notes.NoteMarkdown'].objects.get_or_create(
                    note=note)[0]
                nmd.html = html
                nmd.save()
                sys.stdout.write(' ')

            # manage the display and the chunk limit for every note. A failed
            # note previously skipped this section via `continue`, which let
            # the counter step over max_notes so the loop never stopped at
            # the requested limit.
            counter = counter + 1
            # track 20 notes per line
            if counter % 20 == 0:
                # finish off previous line and start new line
                display_counts(counter, max_notes)
                # flush per line, just in case it isn't outputting
                sys.stdout.flush()

            # perform migration in discrete chunks to deal with the transaction
            if counter == max_notes:
                break

        # Display the score. Single-argument print(...) emits the same text
        # under the Python 2 print statement and the Python 3 function.
        print("Migrated {0} notes and failed to migrate {1} notes.".format(
            len(good), len(bad)))
        print("Of good notes, {0} are editable and {1} are not.".format(
            len(edit), len(nonedit)))

        print("Failed list:")
        for slug in bad:
            print(slug)