def save(self, *args, **kwargs):
    """Fill in and sanitize ``self.html`` before delegating to the parent save.

    When markdown text is present and no HTML has been set, the HTML is
    rendered from the markdown first. The HTML is then passed through one of
    two sanitizer routines, chosen by ``self.note.is_editable()``, and the
    record is saved through the parent class.
    """
    # Render from markdown only when there is nothing in the HTML field yet.
    needs_render = self.markdown and not self.html
    if needs_render:
        self.html = markdown.markdown(self.markdown)

    # Pick the sanitizer that matches the note's editability.
    sanitize = (
        sanitizer.sanitize_html_to_editable
        if self.note.is_editable()
        else sanitizer.sanitize_html_preserve_formatting
    )
    self.html = sanitize(self.html)

    super(NoteMarkdown, self).save(*args, **kwargs)
def save(self, *args, **kwargs):
    """Fill in and sanitize ``self.html`` before delegating to the parent save.

    When markdown text is present and no HTML has been set, the HTML is
    rendered from the markdown first. The HTML is then passed through one of
    two sanitizer routines, chosen by ``self.note.is_editable()``, and the
    record is saved through the parent class.
    """
    # Render from markdown only when there is nothing in the HTML field yet.
    needs_render = self.markdown and not self.html
    if needs_render:
        self.html = markdown.markdown(self.markdown)

    # Pick the sanitizer that matches the note's editability.
    sanitize = (
        sanitizer.sanitize_html_to_editable
        if self.note.is_editable()
        else sanitizer.sanitize_html_preserve_formatting
    )
    self.html = sanitize(self.html)

    super(NoteMarkdown, self).save(*args, **kwargs)
def test_font_face_data_uri(self):
    """A ``data:`` URI in an ``@font-face`` rule is replaced by an http(s) URL.

    Also checks that running the formatting-preserving sanitizer over the
    rewritten markup leaves it equivalent, i.e. cleaning is idempotent.
    """
    # Note: the payload below is not a valid font (it's the red dot).
    html = '''<style>@font-face { src: url('data:application/font-woff;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=='); }</style>'''
    rewritten = sanitizer.data_uris_to_s3(html)

    # The inline data URI must be gone...
    leftover_data_uri = re.search(r"url\('data:application", rewritten)
    self.assertFalse(
        leftover_data_uri,
        "data URL not removed: {}".format(rewritten))

    # ...and an http/https URL must have taken its place.
    inserted_url = re.search(r"url\('https?://[^\)]+\)", rewritten)
    self.assertTrue(
        inserted_url,
        "URL not inserted: {}".format(rewritten))

    # Sanitizing the already-rewritten markup must not change it.
    resanitized = sanitizer.sanitize_html_preserve_formatting(rewritten)
    self.assertHTMLEqual(rewritten, resanitized)
def test_font_face_data_uri(self):
    """A ``data:`` URI in an ``@font-face`` rule is replaced by an http(s) URL.

    Also checks that running the formatting-preserving sanitizer over the
    rewritten markup leaves it equivalent, i.e. cleaning is idempotent.
    """
    # Note: the payload below is not a valid font (it's the red dot).
    html = '''<style>@font-face { src: url('data:application/font-woff;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=='); }</style>'''
    rewritten = sanitizer.data_uris_to_s3(html)

    # The inline data URI must be gone...
    leftover_data_uri = re.search(r"url\('data:application", rewritten)
    self.assertFalse(
        leftover_data_uri,
        "data URL not removed: {}".format(rewritten))

    # ...and an http/https URL must have taken its place.
    inserted_url = re.search(r"url\('https?://[^\)]+\)", rewritten)
    self.assertTrue(
        inserted_url,
        "URL not inserted: {}".format(rewritten))

    # Sanitizing the already-rewritten markup must not change it.
    resanitized = sanitizer.sanitize_html_preserve_formatting(rewritten)
    self.assertHTMLEqual(rewritten, resanitized)
def forwards(self, orm):
    """Copy each note's HTML from S3 storage into its NoteMarkdown row.

    Selects every ``notes.Note`` whose related NoteMarkdown ``html`` is null,
    downloads ``html/<slug>.html`` from the default storage bucket, cleans it
    with the sanitizer and stores the result on the note's NoteMarkdown
    object (created if missing).

    The run can be limited to a chunk of notes by setting the environment
    variable ``NOTE_LIMIT_0021`` to an integer; without it every selected
    note is processed. To continue with the next chunk, delete the migration
    from the south table and run it again.

    Progress is written to stdout, one marker group per note:
    ``:`` download started, ``(`` no HTML found, ``]`` stored as editable,
    ``)`` stored as non-editable. A summary and the list of failed slugs are
    printed at the end.

    :param orm: South's frozen ORM. Use ``orm['appname.ModelName']`` rather
        than importing models directly.
    """
    # Keep score. Saved as lists for debugging purposes if needed.
    good = []
    edit = []
    nonedit = []
    bad = []

    # At the time of migration, editable categories are limited to this.
    EDITABLE_CATEGORIES = ('LECTURE_NOTES',)

    necessary_notes = orm['notes.Note'].objects.filter(
        notemarkdown__html__isnull=True)
    n_notes = necessary_notes.count()

    # Perform the migration in discrete chunks to deal with the transaction
    # (just delete the migration from the south table and run again).
    limitkey = 'NOTE_LIMIT_0021'
    sys.stdout.write('Running until ')
    if limitkey in os.environ:
        max_notes = int(os.environ[limitkey])
        display_counts(max_notes, n_notes)
    else:
        max_notes = n_notes
        display_counts(n_notes, n_notes)

    # Visualization to show how well this is moving through a large database.
    # NOTE(review): display_counts is a helper not visible in this block; it
    # presumably prints a "current / total" style progress line.
    counter = 0
    display_counts(counter, max_notes)

    # Find each Note without an html field, download its S3 html, and store
    # it in the local database.
    for note in necessary_notes:
        # Enforce the chunk limit before touching another note. Checking
        # here with ">=" covers failed notes too, so a failure landing on
        # the limit can no longer make the loop run past it.
        if counter >= max_notes:
            break

        # Path copy/pasted from model code for Note.get_relative_s3_path.
        note_s3_path = 'html/{0}.html'.format(note.slug)
        sys.stdout.write(':')
        sys.stdout.flush()

        # Download the S3 content; a missing key leaves the html empty.
        key = default_storage.bucket.get_key(note_s3_path)
        html = key.read() if key else ''

        if html:
            good.append(note.slug)

            # Clean the html in a consistent way with note uploads as of
            # the time of this migration.
            # Handle embedded images from pdf2htmlEX or other sources.
            html = sanitizer.data_uris_to_s3(html)
            if note.category in EDITABLE_CATEGORIES:
                # Make HTML editable.
                html = sanitizer.sanitize_html_to_editable(html)
                sys.stdout.write(']')
                edit.append(note)
            else:
                # Clean up HTML without concern for editing.
                html = sanitizer.sanitize_html_preserve_formatting(html)
                sys.stdout.write(')')
                nonedit.append(note)

            # Store the html in the corresponding NoteMarkdown object.
            nmd = orm['notes.NoteMarkdown'].objects.get_or_create(note=note)[0]
            nmd.html = html
            nmd.save()
            sys.stdout.write(' ')
        else:
            # Nothing usable was downloaded; record the failure.
            sys.stdout.write('( ')
            bad.append(note.slug)

        # Manage the display for every attempted note, good or bad.
        counter += 1
        # Track 20 notes per line.
        if counter % 20 == 0:
            # Finish off previous line and start new line.
            display_counts(counter, max_notes)
            # Flush per line, just in case it isn't outputting.
            sys.stdout.flush()

    # Display the score. A single parenthesized argument prints the same
    # text under Python 2's print statement.
    print("Migrated {0} notes and failed to migrate {1} notes.".format(
        len(good), len(bad)))
    print("Of good notes, {0} are editable and {1} are not.".format(
        len(edit), len(nonedit)))
    print("Failed list:")
    for slug in bad:
        print(slug)
def forwards(self, orm):
    """Copy each note's HTML from S3 storage into its NoteMarkdown row.

    Selects every ``notes.Note`` whose related NoteMarkdown ``html`` is null,
    downloads ``html/<slug>.html`` from the default storage bucket, cleans it
    with the sanitizer and stores the result on the note's NoteMarkdown
    object (created if missing).

    The run can be limited to a chunk of notes by setting the environment
    variable ``NOTE_LIMIT_0021`` to an integer; without it every selected
    note is processed. To continue with the next chunk, delete the migration
    from the south table and run it again.

    Progress is written to stdout, one marker group per note:
    ``:`` download started, ``(`` no HTML found, ``]`` stored as editable,
    ``)`` stored as non-editable. A summary and the list of failed slugs are
    printed at the end.

    :param orm: South's frozen ORM. Use ``orm['appname.ModelName']`` rather
        than importing models directly.
    """
    # Keep score. Saved as lists for debugging purposes if needed.
    good = []
    edit = []
    nonedit = []
    bad = []

    # At the time of migration, editable categories are limited to this.
    EDITABLE_CATEGORIES = ('LECTURE_NOTES', )

    necessary_notes = orm['notes.Note'].objects.filter(
        notemarkdown__html__isnull=True)
    n_notes = necessary_notes.count()

    # Perform the migration in discrete chunks to deal with the transaction
    # (just delete the migration from the south table and run again).
    limitkey = 'NOTE_LIMIT_0021'
    sys.stdout.write('Running until ')
    if limitkey in os.environ:
        max_notes = int(os.environ[limitkey])
        display_counts(max_notes, n_notes)
    else:
        max_notes = n_notes
        display_counts(n_notes, n_notes)

    # Visualization to show how well this is moving through a large database.
    # NOTE(review): display_counts is a helper not visible in this block; it
    # presumably prints a "current / total" style progress line.
    counter = 0
    display_counts(counter, max_notes)

    # Find each Note without an html field, download its S3 html, and store
    # it in the local database.
    for note in necessary_notes:
        # Enforce the chunk limit before touching another note. Checking
        # here with ">=" covers failed notes too, so a failure landing on
        # the limit can no longer make the loop run past it.
        if counter >= max_notes:
            break

        # Path copy/pasted from model code for Note.get_relative_s3_path.
        note_s3_path = 'html/{0}.html'.format(note.slug)
        sys.stdout.write(':')
        sys.stdout.flush()

        # Download the S3 content; a missing key leaves the html empty.
        key = default_storage.bucket.get_key(note_s3_path)
        html = key.read() if key else ''

        if html:
            good.append(note.slug)

            # Clean the html in a consistent way with note uploads as of
            # the time of this migration.
            # Handle embedded images from pdf2htmlEX or other sources.
            html = sanitizer.data_uris_to_s3(html)
            if note.category in EDITABLE_CATEGORIES:
                # Make HTML editable.
                html = sanitizer.sanitize_html_to_editable(html)
                sys.stdout.write(']')
                edit.append(note)
            else:
                # Clean up HTML without concern for editing.
                html = sanitizer.sanitize_html_preserve_formatting(html)
                sys.stdout.write(')')
                nonedit.append(note)

            # Store the html in the corresponding NoteMarkdown object.
            nmd = orm['notes.NoteMarkdown'].objects.get_or_create(note=note)[0]
            nmd.html = html
            nmd.save()
            sys.stdout.write(' ')
        else:
            # Nothing usable was downloaded; record the failure.
            sys.stdout.write('( ')
            bad.append(note.slug)

        # Manage the display for every attempted note, good or bad.
        counter += 1
        # Track 20 notes per line.
        if counter % 20 == 0:
            # Finish off previous line and start new line.
            display_counts(counter, max_notes)
            # Flush per line, just in case it isn't outputting.
            sys.stdout.flush()

    # Display the score. A single parenthesized argument prints the same
    # text under Python 2's print statement.
    print("Migrated {0} notes and failed to migrate {1} notes.".format(
        len(good), len(bad)))
    print("Of good notes, {0} are editable and {1} are not.".format(
        len(edit), len(nonedit)))
    print("Failed list:")
    for slug in bad:
        print(slug)