Example #1
0
    def test_image_data_uri(self):
        """Check that data_uris_to_s3 leaves an http(s) URL in the img src.

        The converted markup is then run through data_uris_to_s3 a second
        time and must compare equal (as HTML) to the first result.
        """
        # NOTE(review): the src attribute in this literal is empty;
        # presumably it once carried a data URI -- confirm against history.
        url_pattern = r'^https?://.*$'
        converted = sanitizer.data_uris_to_s3('<img src="">')

        img_src = BeautifulSoup(converted).img['src']
        found = bool(re.match(url_pattern, img_src))
        self.assertTrue(
            found, "{} does not match {}".format(converted, url_pattern))

        # A second pass over already-converted markup must change nothing.
        second_pass = sanitizer.data_uris_to_s3(converted)
        self.assertHTMLEqual(converted, second_pass)
Example #2
0
    def test_image_data_uri(self):
        """Check that data_uris_to_s3 leaves an http(s) URL in the img src.

        The converted markup is then run through data_uris_to_s3 a second
        time and must compare equal (as HTML) to the first result.
        """
        # NOTE(review): the src attribute in this literal is empty;
        # presumably it once carried a data URI -- confirm against history.
        url_pattern = r'^https?://.*$'
        converted = sanitizer.data_uris_to_s3('<img src="">')

        img_src = BeautifulSoup(converted).img['src']
        found = bool(re.match(url_pattern, img_src))
        self.assertTrue(
            found, "{} does not match {}".format(converted, url_pattern))

        # A second pass over already-converted markup must change nothing.
        second_pass = sanitizer.data_uris_to_s3(converted)
        self.assertHTMLEqual(converted, second_pass)
Example #3
0
    def test_font_face_data_uri(self):
        """Check that an @font-face data URI is swapped for an http(s) URL.

        After conversion the stylesheet must no longer contain the
        ``url('data:application`` prefix and must contain an http(s) ``url(``
        reference.  The converted markup must also compare equal (as HTML)
        to the output of sanitize_html_preserve_formatting applied to it.
        """
        # Note: this data-uri is not a valid font (it's the red dot).
        font_css = '''<style>@font-face { src: url('data:application/font-woff;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=='); }</style>'''

        converted = sanitizer.data_uris_to_s3(font_css)

        leftover = re.search(r"url\('data:application", converted)
        self.assertFalse(
            leftover, "data URL not removed: {}".format(converted))

        replacement = re.search(r"url\('https?://[^\)]+\)", converted)
        self.assertTrue(
            replacement, "URL not inserted: {}".format(converted))

        # Ensure that cleaning is idempotent.
        recleaned = sanitizer.sanitize_html_preserve_formatting(converted)
        self.assertHTMLEqual(converted, recleaned)
Example #4
0
    def test_font_face_data_uri(self):
        """Check that an @font-face data URI is swapped for an http(s) URL.

        After conversion the stylesheet must no longer contain the
        ``url('data:application`` prefix and must contain an http(s) ``url(``
        reference.  The converted markup must also compare equal (as HTML)
        to the output of sanitize_html_preserve_formatting applied to it.
        """
        # Note: this data-uri is not a valid font (it's the red dot).
        font_css = '''<style>@font-face { src: url('data:application/font-woff;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=='); }</style>'''

        converted = sanitizer.data_uris_to_s3(font_css)

        leftover = re.search(r"url\('data:application", converted)
        self.assertFalse(
            leftover, "data URL not removed: {}".format(converted))

        replacement = re.search(r"url\('https?://[^\)]+\)", converted)
        self.assertTrue(
            replacement, "URL not inserted: {}".format(converted))

        # Ensure that cleaning is idempotent.
        recleaned = sanitizer.sanitize_html_preserve_formatting(converted)
        self.assertHTMLEqual(converted, recleaned)
Example #5
0
def convert_raw_document(raw_document, user=None):
    """Upload a raw document to Google Drive and populate its Note.

    The document's bytes are uploaded to Drive, the converted content is
    downloaded again, and the Note produced by
    ``raw_document.convert_to_note()`` is filled in with the Drive link,
    the extracted text and, when any HTML is available, a NoteMarkdown row.

    Args:
        raw_document: the uploaded document.  This function uses its
            ``get_file()``, ``name``, ``mimetype``, ``fp_file``,
            ``is_processed``, ``save()`` and ``convert_to_note()``.
        user: optional uploader.  When it is missing or anonymous the
            uploader is looked up through UserUploadMapping instead.

    Returns:
        None.  The raw document and the note are saved as side effects.
    """
    fp_file = raw_document.get_file()

    # extract some properties from the document metadata
    filename = raw_document.name
    # NOTE(review): this is captured before the ENML rewrite below, so the
    # Drive helpers receive the original mimetype while the media upload
    # uses the rewritten one -- confirm that this is intended.
    mimetype = raw_document.mimetype
    # Diagnostics go through the module logger rather than stdout.
    logger.debug("this is the mimetype of the document to check: %s",
                 mimetype)

    # A special case for Evernotes
    if raw_document.mimetype == 'text/enml':
        raw_document.mimetype = 'text/html'

    original_content = fp_file.read()

    # Include mimetype parameter if there is one to include
    extra_flags = {}
    if raw_document.mimetype:
        extra_flags['mimetype'] = raw_document.mimetype
    media = MediaInMemoryUpload(original_content, chunksize=1024 * 1024,
                                resumable=True, **extra_flags)

    service = build_api_service()

    # upload to google drive
    file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)

    # download from google drive
    content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)

    # this should have already happened, let's see why it hasn't
    raw_document.is_processed = True
    raw_document.save()

    note = raw_document.convert_to_note()

    # Cache the uploaded file's URL
    note.gdrive_url = file_dict['alternateLink']
    note.text = content_dict['text']

    # Extract HTML from the appropriate place
    html = ''
    if raw_document.mimetype == PDF_MIMETYPE:
        html = pdf2html(original_content)
    elif raw_document.mimetype in PPT_MIMETYPES:
        html = pdf2html(content_dict['pdf'])
    elif 'html' in content_dict and content_dict['html']:
        html = content_dict['html']

    if html:
        html = sanitizer.data_uris_to_s3(html)
        NoteMarkdown.objects.create(note=note, html=html)

    # If we know the user who uploaded this,
    # associate them with the note
    if user and not user.is_anonymous():
        note.user = user
        NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
    else:
        try:
            mapping = UserUploadMapping.objects.get(
                fp_file=raw_document.fp_file)
            note.user = mapping.user
            note.save()
            NoteKarmaEvent.create_event(mapping.user, note,
                                        NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            # Lazy %-formatting: no string is built unless the record is
            # emitted, and a non-string name cannot break concatenation.
            logger.info("Zero or multiple mappings found with fp_file %s",
                        raw_document.fp_file.name)

    # Finally, save whatever data we got back from google
    note.save()
Example #6
0
def convert_raw_document(raw_document, user=None):
    """Upload a raw document to Google Drive and populate its Note.

    The document's bytes are uploaded to Drive, the converted content is
    downloaded again, and the Note produced by
    ``raw_document.convert_to_note()`` is filled in with the Drive link,
    the extracted text and, when any HTML is available, a NoteMarkdown row.

    Args:
        raw_document: the uploaded document.  This function uses its
            ``get_file()``, ``name``, ``mimetype``, ``fp_file``,
            ``is_processed``, ``save()`` and ``convert_to_note()``.
        user: optional uploader.  When it is missing or anonymous the
            uploader is looked up through UserUploadMapping instead.

    Returns:
        None.  The raw document and the note are saved as side effects.
    """
    fp_file = raw_document.get_file()

    # extract some properties from the document metadata
    filename = raw_document.name
    # NOTE(review): this is captured before the ENML rewrite below, so the
    # Drive helpers receive the original mimetype while the media upload
    # uses the rewritten one -- confirm that this is intended.
    mimetype = raw_document.mimetype
    # Diagnostics go through the module logger rather than stdout.
    logger.debug("this is the mimetype of the document to check: %s",
                 mimetype)

    # A special case for Evernotes
    if raw_document.mimetype == 'text/enml':
        raw_document.mimetype = 'text/html'

    original_content = fp_file.read()

    # Include mimetype parameter if there is one to include
    extra_flags = {}
    if raw_document.mimetype:
        extra_flags['mimetype'] = raw_document.mimetype
    media = MediaInMemoryUpload(original_content, chunksize=1024 * 1024,
                                resumable=True, **extra_flags)

    service = build_api_service()

    # upload to google drive
    file_dict = upload_to_gdrive(service, media, filename, mimetype=mimetype)

    # download from google drive
    content_dict = download_from_gdrive(service, file_dict, mimetype=mimetype)

    # this should have already happened, let's see why it hasn't
    raw_document.is_processed = True
    raw_document.save()

    note = raw_document.convert_to_note()

    # Cache the uploaded file's URL
    note.gdrive_url = file_dict['alternateLink']
    note.text = content_dict['text']

    # Extract HTML from the appropriate place
    html = ''
    if raw_document.mimetype == PDF_MIMETYPE:
        html = pdf2html(original_content)
    elif raw_document.mimetype in PPT_MIMETYPES:
        html = pdf2html(content_dict['pdf'])
    elif 'html' in content_dict and content_dict['html']:
        html = content_dict['html']

    if html:
        html = sanitizer.data_uris_to_s3(html)
        NoteMarkdown.objects.create(note=note, html=html)

    # If we know the user who uploaded this,
    # associate them with the note
    if user and not user.is_anonymous():
        note.user = user
        NoteKarmaEvent.create_event(user, note, NoteKarmaEvent.UPLOAD)
    else:
        try:
            mapping = UserUploadMapping.objects.get(
                fp_file=raw_document.fp_file)
            note.user = mapping.user
            note.save()
            NoteKarmaEvent.create_event(mapping.user, note,
                                        NoteKarmaEvent.UPLOAD)
        except (ObjectDoesNotExist, MultipleObjectsReturned):
            # Lazy %-formatting: no string is built unless the record is
            # emitted, and a non-string name cannot break concatenation.
            logger.info("Zero or multiple mappings found with fp_file %s",
                        raw_document.fp_file.name)

    # Finally, save whatever data we got back from google
    note.save()
    def forwards(self, orm):
        """Copy each note's HTML from S3 into its NoteMarkdown row.

        Notes matched by ``notemarkdown__html__isnull=True`` are looked up
        in the default storage bucket under ``html/<slug>.html``.  HTML that
        downloads successfully is passed through ``data_uris_to_s3``, then
        sanitized (editable for categories in EDITABLE_CATEGORIES,
        formatting-preserving otherwise) and stored on the note's
        NoteMarkdown.  Progress markers are written to stdout and a summary
        is printed at the end.

        Set the NOTE_LIMIT_0021 environment variable to an integer to stop
        once that many notes have been visited; notes whose download fails
        count toward the limit.
        """
        # Note: Don't use "from appname.models import ModelName".
        # Use orm.ModelName to refer to models in this application,
        # and orm['appname.ModelName'] for models in other applications.

        # keep score. save as lists for debugging purposes if needed.
        good = []
        edit = []
        nonedit = []
        bad = []

        # at the time of migration, editable categories are limited to
        EDITABLE_CATEGORIES = ('LECTURE_NOTES',)

        necessary_notes = orm['notes.Note'].objects.filter(
            notemarkdown__html__isnull=True)
        n_notes = necessary_notes.count()

        # perform migration in discrete chunks to deal with the transaction
        # (just delete the migration from the south table and run again)
        limitkey = 'NOTE_LIMIT_0021'
        sys.stdout.write('Running until ')
        if limitkey in os.environ:
            max_notes = int(os.environ[limitkey])
            display_counts(max_notes, n_notes)
        else:
            max_notes = n_notes
            display_counts(n_notes, n_notes)

        # visualization to show how well this is moving through a large
        # database.
        counter = 0
        display_counts(counter, max_notes)
        # find each Note without an html field, download its S3 html, and
        # store it in the local database.
        for note in necessary_notes:
            # download the s3 content
            html = ''
            # copy/pasted from model code for Note.get_relative_s3_path
            note_s3_path = 'html/{0}.html'.format(note.slug)
            sys.stdout.write(':')
            sys.stdout.flush()
            key = default_storage.bucket.get_key(note_s3_path)
            if key:
                html = key.read()

            if html:
                good.append(note.slug)

                # clean the html in a consistent way with note uploads as of
                # the time of this migration.
                # handle embedded images from pdf2htmlEX or other sources
                html = sanitizer.data_uris_to_s3(html)
                if note.category in EDITABLE_CATEGORIES:
                    # make HTML editable
                    html = sanitizer.sanitize_html_to_editable(html)
                    sys.stdout.write(']')
                    edit.append(note)
                else:
                    # clean up HTML without concern for editing
                    html = sanitizer.sanitize_html_preserve_formatting(html)
                    sys.stdout.write(')')
                    nonedit.append(note)

                # store the html in the corresponding NoteMarkdown object
                nmd = orm['notes.NoteMarkdown'].objects.get_or_create(
                    note=note)[0]
                nmd.html = html
                nmd.save()
                sys.stdout.write(' ')
            else:
                # nothing was downloaded for this note
                sys.stdout.write('( ')
                bad.append(note.slug)

            # Bookkeeping is shared by both outcomes so that a failed note
            # can neither skip the progress line nor step over the limit.
            counter += 1
            # track 20 notes per line
            if counter % 20 == 0:
                # finish off previous line and start new line
                display_counts(counter, max_notes)
                # flush per line, just in case it isn't outputting
                sys.stdout.flush()

            # perform migration in discrete chunks to deal with the
            # transaction
            if counter == max_notes:
                break

        # Display the score
        print("Migrated {0} notes and failed to migrate {1} notes.".format(
            len(good), len(bad)))
        print("Of good notes, {0} are editable and {1} are not.".format(
            len(edit), len(nonedit)))

        print("Failed list:")
        for slug in bad:
            print(slug)
Example #8
0
    def forwards(self, orm):
        """Copy each note's HTML from S3 into its NoteMarkdown row.

        Notes matched by ``notemarkdown__html__isnull=True`` are looked up
        in the default storage bucket under ``html/<slug>.html``.  HTML that
        downloads successfully is passed through ``data_uris_to_s3``, then
        sanitized (editable for categories in EDITABLE_CATEGORIES,
        formatting-preserving otherwise) and stored on the note's
        NoteMarkdown.  Progress markers are written to stdout and a summary
        is printed at the end.

        Set the NOTE_LIMIT_0021 environment variable to an integer to stop
        once that many notes have been visited; notes whose download fails
        count toward the limit.
        """
        # Note: Don't use "from appname.models import ModelName".
        # Use orm.ModelName to refer to models in this application,
        # and orm['appname.ModelName'] for models in other applications.

        # keep score. save as lists for debugging purposes if needed.
        good = []
        edit = []
        nonedit = []
        bad = []

        # at the time of migration, editable categories are limited to
        EDITABLE_CATEGORIES = ('LECTURE_NOTES', )

        necessary_notes = orm['notes.Note'].objects.filter(
            notemarkdown__html__isnull=True)
        n_notes = necessary_notes.count()

        # perform migration in discrete chunks to deal with the transaction
        # (just delete the migration from the south table and run again)
        limitkey = 'NOTE_LIMIT_0021'
        sys.stdout.write('Running until ')
        if limitkey in os.environ:
            max_notes = int(os.environ[limitkey])
            display_counts(max_notes, n_notes)
        else:
            max_notes = n_notes
            display_counts(n_notes, n_notes)

        # visualization to show how well this is moving through a large
        # database.
        counter = 0
        display_counts(counter, max_notes)
        # find each Note without an html field, download its S3 html, and
        # store it in the local database.
        for note in necessary_notes:
            # download the s3 content
            html = ''
            # copy/pasted from model code for Note.get_relative_s3_path
            note_s3_path = 'html/{0}.html'.format(note.slug)
            sys.stdout.write(':')
            sys.stdout.flush()
            key = default_storage.bucket.get_key(note_s3_path)
            if key:
                html = key.read()

            if html:
                good.append(note.slug)

                # clean the html in a consistent way with note uploads as of
                # the time of this migration.
                # handle embedded images from pdf2htmlEX or other sources
                html = sanitizer.data_uris_to_s3(html)
                if note.category in EDITABLE_CATEGORIES:
                    # make HTML editable
                    html = sanitizer.sanitize_html_to_editable(html)
                    sys.stdout.write(']')
                    edit.append(note)
                else:
                    # clean up HTML without concern for editing
                    html = sanitizer.sanitize_html_preserve_formatting(html)
                    sys.stdout.write(')')
                    nonedit.append(note)

                # store the html in the corresponding NoteMarkdown object
                nmd = orm['notes.NoteMarkdown'].objects.get_or_create(
                    note=note)[0]
                nmd.html = html
                nmd.save()
                sys.stdout.write(' ')
            else:
                # nothing was downloaded for this note
                sys.stdout.write('( ')
                bad.append(note.slug)

            # Bookkeeping is shared by both outcomes so that a failed note
            # can neither skip the progress line nor step over the limit.
            counter += 1
            # track 20 notes per line
            if counter % 20 == 0:
                # finish off previous line and start new line
                display_counts(counter, max_notes)
                # flush per line, just in case it isn't outputting
                sys.stdout.flush()

            # perform migration in discrete chunks to deal with the
            # transaction
            if counter == max_notes:
                break

        # Display the score
        print("Migrated {0} notes and failed to migrate {1} notes.".format(
            len(good), len(bad)))
        print("Of good notes, {0} are editable and {1} are not.".format(
            len(edit), len(nonedit)))

        print("Failed list:")
        for slug in bad:
            print(slug)