def test_image_data_uri(self):
    html = '<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==">'
    s3ified = sanitizer.data_uris_to_s3(html)
    soup = BeautifulSoup(s3ified)
    regex = r'^https?://.*$'
    self.assertTrue(bool(re.match(regex, soup.img['src'])),
                    "{} does not match {}".format(s3ified, regex))
    # Ensure that cleaning is idempotent.
    resanitized = sanitizer.data_uris_to_s3(s3ified)
    self.assertHTMLEqual(s3ified, resanitized)
def test_font_face_data_uri(self):
    # Note: this data-uri is not a valid font (it's the red dot).
    html = '''<style>@font-face { src: url('data:application/font-woff;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=='); }</style>'''
    s3ified = sanitizer.data_uris_to_s3(html)
    self.assertFalse(re.search(r"url\('data:application", s3ified),
                     "data URL not removed: {}".format(s3ified))
    self.assertTrue(re.search(r"url\('https?://[^\)]+\)", s3ified),
                    "URL not inserted: {}".format(s3ified))
    # Ensure that cleaning is idempotent.
    self.assertHTMLEqual(
        s3ified, sanitizer.sanitize_html_preserve_formatting(s3ified))
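# The two tests above pin down the sanitizer contract: every base64 data URI,
# whether in an <img src> or a CSS url(), gets replaced by an https URL, and
# re-running the cleaner is a no-op. Below is a minimal sketch of such a
# function, assuming Django's default_storage is backed by S3 (as the 0021
# migration below assumes). DATA_URI_RE, _store_data_uri, and
# data_uris_to_s3_sketch are hypothetical names, not the real sanitizer
# internals.
import base64
import hashlib
import re

from django.core.files.base import ContentFile
from django.core.files.storage import default_storage

DATA_URI_RE = re.compile(r'data:[\w/+.-]+;base64,([A-Za-z0-9+/=]+)')


def _store_data_uri(match):
    """Decode one base64 data URI, store the bytes, and return their URL."""
    content = base64.b64decode(match.group(1))
    # Content-addressed names keep the rewrite idempotent: already-rewritten
    # HTML contains no data URIs, so a second pass changes nothing.
    name = 'data_uri/{0}'.format(hashlib.sha1(content).hexdigest())
    if not default_storage.exists(name):
        default_storage.save(name, ContentFile(content))
    return default_storage.url(name)


def data_uris_to_s3_sketch(html):
    """Replace every base64 data URI in the html with a stored-file URL."""
    return DATA_URI_RE.sub(_store_data_uri, html)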
def convert_raw_document(raw_document, user=None):
    """Upload a raw document to Google Drive and get a Note back."""
    fp_file = raw_document.get_file()

    # Extract some properties from the document metadata.
    filename = raw_document.name

    # A special case for Evernotes: treat ENML as HTML.
    if raw_document.mimetype == 'text/enml':
        raw_document.mimetype = 'text/html'
    mimetype = raw_document.mimetype
    logger.debug("Mimetype of the document to convert: %s", mimetype)

    original_content = fp_file.read()

    # Include the mimetype parameter only if there is one to include.
    extra_flags = {'mimetype': mimetype} if mimetype else {}
    media = MediaInMemoryUpload(original_content, chunksize=1024 * 1024,
                                resumable=True, **extra_flags)

    service = build_api_service()

    # Round-trip the document through Google Drive.
    file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)
    content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)

    # This should have already happened; let's see why it hasn't.
    raw_document.is_processed = True
    raw_document.save()

    note = raw_document.convert_to_note()

    # Cache the uploaded file's URL.
    note.gdrive_url = file_dict['alternateLink']
    note.text = content_dict['text']

    # Extract HTML from the appropriate place.
    html = ''
    if raw_document.mimetype == PDF_MIMETYPE:
        html = pdf2html(original_content)
    elif raw_document.mimetype in PPT_MIMETYPES:
        html = pdf2html(content_dict['pdf'])
    elif content_dict.get('html'):
        html = content_dict['html']
    if html:
        html = sanitizer.data_uris_to_s3(html)
        NoteMarkdown.objects.create(note=note, html=html)

    # If we know the user who uploaded this, associate them with the note.
    if user and not user.is_anonymous():
        note.user = user
        NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
    else:
        try:
            mapping = UserUploadMapping.objects.get(
                fp_file=raw_document.fp_file)
            note.user = mapping.user
            note.save()
            NoteKarmaEvent.create_event(mapping.user, note,
                                        NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            logger.info("Zero or multiple mappings found with fp_file " +
                        raw_document.fp_file.name)

    # Finally, save whatever data we got back from Google.
    note.save()
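# A hedged usage sketch: convert_raw_document expects a persisted RawDocument
# whose file contents are reachable via get_file(), plus (optionally) the
# uploading user. The helper name and the pk-based lookup are hypothetical,
# and RawDocument is assumed importable from its models module; in this
# codebase the call would come from the upload-processing flow.
def process_pending_upload_sketch(raw_document_id, user=None):
    raw_document = RawDocument.objects.get(pk=raw_document_id)
    if not raw_document.is_processed:
        convert_raw_document(raw_document, user=user)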
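# pdf2html is imported from elsewhere in the codebase. A sketch of what a
# pdf2htmlEX-backed implementation could look like; the binary name and
# --dest-dir flag are standard pdf2htmlEX usage, but this is an assumption
# about the real helper, not a copy of it.
import os
import subprocess
import tempfile


def pdf2html_sketch(pdf_bytes):
    """Convert raw PDF bytes to a single self-contained HTML string."""
    workdir = tempfile.mkdtemp()
    pdf_path = os.path.join(workdir, 'input.pdf')
    with open(pdf_path, 'wb') as f:
        f.write(pdf_bytes)
    # pdf2htmlEX writes the named output file into --dest-dir.
    subprocess.check_call(['pdf2htmlEX', '--dest-dir', workdir,
                           pdf_path, 'output.html'])
    with open(os.path.join(workdir, 'output.html')) as f:
        return f.read()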
def forwards(self, orm):
    "Write your forwards methods here."
    # Note: Don't use "from appname.models import ModelName".
    # Use orm.ModelName to refer to models in this application,
    # and orm['appname.ModelName'] for models in other applications.

    # Keep score. Save as lists for debugging purposes if needed.
    good = []
    edit = []
    nonedit = []
    bad = []

    # At the time of migration, editable categories are limited to:
    EDITABLE_CATEGORIES = ('LECTURE_NOTES',)

    # At the time of migration, translated PDFs were identified by mimetype.
    PDF_MIMETYPES = (
        'application/pdf',
        'application/vnd.ms-powerpoint',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation'
    )

    necessary_notes = orm['notes.Note'].objects.filter(
        notemarkdown__html__isnull=True)
    n_notes = necessary_notes.count()

    # Perform the migration in discrete chunks to deal with the transaction.
    # (Just delete the migration from the south table and run again.)
    limitkey = 'NOTE_LIMIT_0021'
    sys.stdout.write('Running until ')
    if limitkey in os.environ:
        max_notes = int(os.environ[limitkey])
        display_counts(max_notes, n_notes)
    else:
        max_notes = n_notes
        display_counts(n_notes, n_notes)

    # Visualization to show how well this is moving through a large database.
    counter = 0
    display_counts(counter, max_notes)

    # Find each Note without an html field, download its S3 html, and
    # store it in the local database.
    for note in necessary_notes:
        # Download the S3 content.
        html = ''
        # Copy/pasted from model code for Note.get_relative_s3_path.
        note_s3_path = 'html/{0}.html'.format(note.slug)
        sys.stdout.write(':')
        sys.stdout.flush()
        key = default_storage.bucket.get_key(note_s3_path)
        if key:
            html = key.read()

        # Check the downloaded html.
        if not html:
            sys.stdout.write('( ')
            bad.append(note.slug)
            counter += 1
            continue
        else:
            good.append(note.slug)

        # Clean the html consistently with note uploads as of the time of
        # this migration.
        # Handle embedded images from pdf2htmlEX or other sources.
        html = sanitizer.data_uris_to_s3(html)
        if note.category in EDITABLE_CATEGORIES:
            # Make the HTML editable.
            html = sanitizer.sanitize_html_to_editable(html)
            sys.stdout.write(']')
            edit.append(note)
        else:
            # Clean up the HTML without concern for editing.
            html = sanitizer.sanitize_html_preserve_formatting(html)
            sys.stdout.write(')')
            nonedit.append(note)

        # Store the html in the corresponding NoteMarkdown object.
        nmd = orm['notes.NoteMarkdown'].objects.get_or_create(note=note)[0]
        nmd.html = html
        nmd.save()

        # Manage the display: track 20 notes per line.
        counter += 1
        sys.stdout.write(' ')
        if counter % 20 == 0:
            # Finish off the previous line and start a new one.
            display_counts(counter, max_notes)
            # Flush per line, just in case it isn't outputting.
            sys.stdout.flush()

        # Perform the migration in discrete chunks to deal with the transaction.
        if counter == max_notes:
            break

    # Display the score.
    print "Migrated {0} notes and failed to migrate {1} notes.".format(
        len(good), len(bad))
    print "Of good notes, {0} are editable and {1} are not.".format(
        len(edit), len(nonedit))
    print "Failed list:"
    for slug in bad:
        print slug
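# display_counts is called throughout forwards() but defined elsewhere in
# this migration file. A minimal sketch consistent with how it is used above,
# terminating a line of progress glyphs with a running "current/total" tally;
# the exact formatting of the real helper is assumed.
def display_counts(current, total):
    sys.stdout.write('{0}/{1} notes\n'.format(current, total))
    sys.stdout.flush()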