def test_using_courses_from_url_in_classification(self):
    """Exam course classification should AND combine PDF and URL parsing."""
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))

    # The PDF text itself yields exactly one course code
    PdfPage.objects.create(text="Exsamen i TMA4000", pdf=pdf, number=0)

    # While URL parsing contributes three additional courses
    scraped_urls = (
        'http://wiki.math.ntnu.no/TMA4100/exams/problems.pdf',
        'http://wiki.math.ntnu.no/TMA4200/exams/problems.pdf',
        'http://wiki.math.ntnu.no/TMA4300/exams/problems.pdf',
    )
    for scraped_url in scraped_urls:
        PdfUrl.objects.create(url=scraped_url, scraped_pdf=pdf)

    # Classification should union both sources: 4 courses all together
    pdf.classify()
    assert pdf.exams.count() == 4
    course_codes = set(pdf.exams.values_list('course_code', flat=True))
    assert course_codes == {'TMA4000', 'TMA4100', 'TMA4200', 'TMA4300'}
def test_combining_urls_and_content_for_classification(self):
    """Exam classification should OR combine PDF and URL parsing."""
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))

    # The PDF text determines course, solutions, and language, but no date
    text = """
    Exsamen i TMA4000
    Dato: USPESIFISERT
    Løsningsforslag
    """
    PdfPage.objects.create(text=text, pdf=pdf, number=0)

    # The first URL is disregarded as the other two are more popular
    urls = [
        'http://wiki.math.ntnu.no/TMA4000/exams/2017_kont.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/h2018.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/2018h.pdf',
    ]
    for url in urls:
        pdf_url = PdfUrl.objects.create(url=url, scraped_pdf=pdf)
        assert pdf_url.exam.year and pdf_url.exam.season

    pdf.classify()

    # The two sources combine into exactly one fully-determined exam
    assert pdf.exams.count() == 1
    exam = pdf.exams.first()
    assert exam.course_code == 'TMA4000'
    assert exam.solutions is True
    assert exam.language == Language.BOKMAL
    assert exam.year == 2018
    assert exam.season == Season.AUTUMN
def test_deletion_of_file_on_delete(tmpdir, settings):
    """FileField file should be cleaned up on Pdf deletion."""
    # Create a Pdf object with an associated downloaded file
    sha1_hash = '4dc828ea76ab618be6d72d135af13c40de3b9ce6'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(content=ContentFile('Exam text'), name=sha1_hash + '.pdf')

    # The file should now exist on disk...
    stored_file = Path(settings.MEDIA_ROOT, pdf.file.name)
    assert stored_file.is_file()

    # ...but deleting the model should remove the file as well
    pdf.delete()
    assert not stored_file.is_file()
def test_classify_pdf_with_several_course_codes(self):
    """Several course codes should be supported for exam PDFs."""
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    # Cover page mentioning three courses, using the "TMA4000/10" shorthand
    text = """
    Exsamen i TMA4000/10 og TIØ4000
    Dato: 11.08.99
    Løsningsforslag
    """.encode('utf-8')
    pdf.file.save(content=ContentFile(text), name=sha1_hash + '.pdf', save=True)
    PdfPage.objects.create(text=text, pdf=pdf, number=0)

    pdf.classify()

    # All three course codes should have been extracted
    extracted_codes = set(pdf.exams.values_list('course_code', flat=True))
    assert extracted_codes == {'TMA4000', 'TMA4010', 'TIØ4000'}

    # And the remaining metadata is shared between all of the exams
    for exam in pdf.exams.all():
        assert exam.year == 1999
        assert exam.season == Season.CONTINUATION
        assert exam.language == Language.BOKMAL
        assert exam.solutions is True
def test_raising_validation_errors_of_wrong_sha1_formatting():
    """SHA1 hash format should be enforced, also for filenames."""
    # A malformed hash is rejected on model creation
    with pytest.raises(ValidationError):
        Pdf.objects.create(sha1_hash='WRONG HASH FORMAT')

    correct_sha1_hash = '4dc828ea76ab618be6d72d135af13c40de3b9ce6'
    pdf = Pdf(sha1_hash=correct_sha1_hash)

    # A filename whose hash does not match the model's hash is rejected too
    with pytest.raises(ValidationError):
        pdf.file.save(
            content=ContentFile(''),
            name=correct_sha1_hash.replace('4', '3') + '.pdf',
        )
def test_determining_solutions_of_exam_without_content(self):
    """Solutions should be OR determined."""
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(sha1_hash + '.pdf', ContentFile('Exam text'))
    PdfPage.objects.create(
        text="Bad OCR handwritten content",
        pdf=pdf,
        number=0,
    )

    # Only one URL indicates solutions, but that signal is fully trusted
    solution_hint_urls = [
        'http://wiki.math.ntnu.no/TMA4000/exams/problems1.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/problems2.pdf',
        'http://wiki.math.ntnu.no/TMA4000/exams/problems3_solutions.pdf',
    ]
    for hint_url in solution_hint_urls:
        PdfUrl.objects.create(url=hint_url, scraped_pdf=pdf)

    # One exam results, and it is classified as containing solutions
    pdf.classify()
    assert pdf.exams.count() == 1
    assert pdf.exams.first().solutions is True
def test_string_content():
    """FileBackup PDFs should be parsable."""
    pdf_path = Path(__file__).parent / 'data' / 'matmod_exam_des_2017.pdf'
    sha1 = '0000000000000000000000000000000000000000'
    pdf_backup = Pdf(sha1_hash=sha1)
    pdf_backup.file.save(
        name=sha1 + '.pdf',
        content=ContentFile(pdf_path.read_bytes()),
    )

    pdf_backup.read_text()
    pdf_backup.save()
    pdf_backup.refresh_from_db()

    # The extracted text must be a unicode string
    assert isinstance(pdf_backup.text, str)

    # Check content through the text property; pages are '\f'-separated
    assert len(pdf_backup.text.split('\f')) == 6
    assert 'Rottman' in pdf_backup.text
    assert 'population model' in pdf_backup.text
    assert 'this is not in the exam' not in pdf_backup.text

    # Check the associated PdfPage model objects
    pages = pdf_backup.pages.all()
    assert pages.count() == 6

    # The ordering should be based on page number
    for page_num, page in enumerate(pages):
        assert page.number == page_num
        assert page.confidence is None

    # And each page should contain its own page-local content
    assert 'Rottman' in pages[0].text
    assert 'Rottman' not in pages[2].text
    assert 'population model' in pages[2].text
    assert 'population model' not in pages[0].text
def test_classifiying_bad_content(self):
    """Classification should handle only Nones."""
    sha1_hash = '0000000000000000000000000000000000000000'
    text = 'Bad OCR!'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(sha1_hash + '.pdf', ContentFile(text))
    PdfPage.objects.create(text=text, pdf=pdf, number=0)

    def assert_single_unclassified_exam():
        # One exam object, with every classifiable field left undetermined
        assert pdf.exams.count() == 1
        exam = pdf.exams.first()
        assert exam.solutions is False
        assert exam.course_code is None
        assert exam.language is None
        assert exam.year is None
        assert exam.season is None

    # First handle bad OCR without any URLs
    pdf.classify()
    assert_single_unclassified_exam()

    # Then handle bad OCR combined with bad URLs
    bad_urls = [
        'http://bad.url/1.pdf',
        'http://bad.url/2.pdf',
        'http://bad.url/3.pdf',
    ]
    for bad_url in bad_urls:
        PdfUrl.objects.create(url=bad_url, scraped_pdf=pdf)
    pdf.classify()
    assert_single_unclassified_exam()
def test_verify_pdf_view(admin_client):
    """Test PDF verification of specific model objects."""
    # We have one PDF...
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(name=sha1_hash + '.pdf', content=ContentFile('exam text'))

    # ...and another one
    sha1_hash2 = '1111111111111111111111111111111111111111'
    pdf2 = Pdf(sha1_hash=sha1_hash2)
    pdf2.file.save(name=sha1_hash2 + '.pdf', content=ContentFile('exam text'))

    # Both PDFs are connected to the same exam
    exam = DocumentInfo.objects.create()
    DocumentInfoSource.objects.create(pdf=pdf, document_info=exam)
    DocumentInfoSource.objects.create(pdf=pdf2, document_info=exam)

    # Yet each resolves to a view with its own PDF as context
    response = admin_client.get(pdf.get_absolute_url())
    assert response.context['pdf'] == pdf
    response2 = admin_client.get(pdf2.get_absolute_url())
    assert response2.context['pdf'] == pdf2
def test_classify_pdf(self):
    """Exam type should be determinable from pdf content."""
    # The PDF contains the following cover page content
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    text = """
    NTNU TMA4115 Matematikk 3
    Institutt for matematiske f*g
    eksamen 11.08.05
    Eksamenssettet har 12 punkter.
    """
    content = ContentFile(text)
    pdf.file.save(content=content, name=sha1_hash + '.pdf', save=True)

    # No errors should be raised when no pages has been saved yet, but
    # False should be returned to indicate a lack of success.
    assert pdf.classify(allow_ocr=True) is False  # Malformed plain text PDF
    assert pdf.classify(allow_ocr=False) is False
    assert pdf.exams.count() == 0

    # But now we add a cover page and classify its content
    PdfPage.objects.create(text=text, pdf=pdf, number=0)
    pdf.refresh_from_db()
    assert pdf.classify() is True

    # And all metadata should be saved
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 1
    exam = pdf.exams.first()
    assert exam.language == Language.BOKMAL
    assert exam.course_code == 'TMA4115'
    assert exam.solutions is False
    assert exam.year == 2005
    assert exam.season == Season.CONTINUATION
    assert exam.content_type == DocumentInfo.EXAM

    # When the classification method changes its result, old results
    # should be removed. This is simulated here by mutating the exam.
    # FIX: this was previously `exam.year == 1999`, a no-op comparison,
    # so the stale-result cleanup path was never actually exercised.
    exam.year = 1999
    exam.save()
    pdf.classify()
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 1
    assert pdf.exams.first().year == 2005

    # But verified exams should NOT be deleted
    verified_exam = DocumentInfo.objects.create(
        year=1999,
        course_code=exam.course_code,
        language=exam.language,
        solutions=exam.solutions,
    )
    user = UserFactory.create(username='******')
    verified_exam_pdf = DocumentInfoSource.objects.create(
        document_info=verified_exam,
        pdf=pdf,
    )
    verified_exam_pdf.verified_by.add(user)
    pdf.classify()
    pdf = Pdf.objects.get(id=pdf.id)
    assert pdf.exams.count() == 2

    # If a PDF has already verified document info, classify should be a
    # no-op.
    DocumentInfoSource.objects.all().delete()
    verified_exam_pdf = DocumentInfoSource.objects.create(
        document_info=verified_exam,
        pdf=pdf,
    )
    verified_exam_pdf.verified_by.add(user)
    pdf.classify(save=True)
    assert pdf.exams.count() == 1
    assert pdf.exams.first() == verified_exam
def test_get_absolute_url_for_pdf():
    """PDF absolute URL should point to verify form view for that pdf."""
    sha1_hash = '4dc828ea76ab618be6d72d135af13c40de3b9ce6'
    pdf = Pdf(sha1_hash=sha1_hash)
    expected_url = '/exams/verify/' + sha1_hash
    assert pdf.get_absolute_url() == expected_url
def test_queryset_organize_method():
    """ExamURLs should be organizable in hierarchy."""
    # All links are related to this course
    CourseFactory(
        full_name='Mathematics 1',
        display_name='Maths 1',
        course_code='TMA4000',
    )

    # A 2016 spring exam in English, with its scraped URL
    exam1 = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2016,
        season=Season.SPRING,
        language=Language.ENGLISH,
    )
    exam_url1 = PdfUrl.objects.create(
        url='http://exams.com/exam',
        exam=exam1,
    )

    # Matching solutions, in English...
    exam1_solutions = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2016,
        season=Season.SPRING,
        solutions=True,
        language=Language.ENGLISH,
    )
    exam_url_solutions = PdfUrl.objects.create(
        url='http://exams.com/solution',
        exam=exam1_solutions,
    )

    # ...and in Bokmål
    eksamen_losning = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2016,
        season=Season.SPRING,
        solutions=True,
        language=Language.BOKMAL,
    )
    eksamen_url_losning = PdfUrl.objects.create(
        url='http://exams.com/losning',
        exam=eksamen_losning,
    )

    # The URL classifier could not determine the language
    url_exam_2015 = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2015,
        season=Season.SPRING,
        solutions=False,
        language=Language.UNKNOWN,
    )
    # But the PDF classifier managed to determine it
    pdf_exam_2015 = DocumentInfo.objects.create(
        course_code='TMA4000',
        year=2015,
        season=Season.SPRING,
        solutions=False,
        language=Language.ENGLISH,
    )
    sha1_hash = '0000000000000000000000000000000000000000'
    exam_2015_pdf = Pdf(sha1_hash=sha1_hash)
    exam_2015_pdf.file.save(sha1_hash + '.pdf', ContentFile('exam text'))
    DocumentInfoSource.objects.create(
        document_info=pdf_exam_2015,
        pdf=exam_2015_pdf,
    )

    # The pdf is scraped
    exam_2015_url = PdfUrl.objects.create(
        url='http://exams.com/exam_2015',
        exam=url_exam_2015,
        scraped_pdf=exam_2015_pdf,
    )

    # Organizing yields a course -> year -> season -> type -> language tree
    organization = DocumentInfo.objects.all().organize()
    assert organization == {
        'TMA4000': {
            'full_name': 'Mathematics 1',
            'nick_name': 'Maths 1',
            'years': {
                2016: {
                    'Vår': {
                        'exams': {
                            'Engelsk': [exam_url1],
                        },
                        'solutions': {
                            'Bokmål': [eksamen_url_losning],
                            'Engelsk': [exam_url_solutions],
                        },
                    },
                },
                2015: {
                    'Vår': {
                        'exams': {
                            'Engelsk': [exam_2015_url],
                        },
                        'solutions': {},
                    },
                },
            },
        },
    }
def test_verify_random_pdf_view(client, django_user_model):
    """Test PDF verification view."""
    # We have one PDF
    sha1_hash = '0000000000000000000000000000000000000000'
    pdf = Pdf(sha1_hash=sha1_hash)
    pdf.file.save(name=sha1_hash + '.pdf', content=ContentFile('exam text'))

    # And three courses
    course1 = CourseFactory(course_code='TMA1000')
    course2 = CourseFactory(course_code='TMA2000')
    course3 = CourseFactory(course_code='TMA3000')

    # The PDF has been inferred to contain the two first of these courses
    common_docinfo_attrs = {
        'language': 'Bokmål',
        'year': 2010,
        'solutions': False,
        'content_type': 'Exam',
    }
    exam1 = DocumentInfo.objects.create(
        course=course1,
        **common_docinfo_attrs,
    )
    DocumentInfoSource.objects.create(pdf=pdf, document_info=exam1)
    exam2 = DocumentInfo.objects.create(
        course=course2,
        **common_docinfo_attrs,
    )
    DocumentInfoSource.objects.create(pdf=pdf, document_info=exam2)

    # We verify a random PDF, in this case our PDF, since there is only one
    user = django_user_model.objects.create_user(
        username='******',
        password='******',
    )
    client.login(username='******', password='******')
    url = reverse('examiner:verify_random')
    response = client.get(url)
    assert response.status_code == 200

    # The form instance is populated with the first exam
    form = response.context['form']
    data = form.initial
    assert form.instance == exam1
    assert data['language'] == 'Bokmål'
    assert data['pdf'] == pdf
    assert data['season'] is None
    assert data['verifier'] == user

    # But both courses appear in the courses field
    assert data['courses'].count() == 2
    assert set(data['courses']) == {course1.id, course2.id}

    # The user now changes the 2 courses
    form = VerifyExamForm({
        'courses': [course2.id, course3.id],
        'pdf': pdf.id,
        'verifier': user.id,
        **common_docinfo_attrs,
    })
    assert form.is_valid()
    response = client.post(url, form.data)
    assert response.status_code == 302

    # We have two new verified exams
    verified_exams = DocumentInfoSource.objects.filter(verified_by__in=[user])
    assert verified_exams.count() == 2

    # Both are connected to our pdf and verified by our user
    exam_pdf1 = verified_exams.first()
    exam_pdf2 = verified_exams.last()
    assert exam_pdf1.pdf == pdf
    assert exam_pdf2.pdf == pdf
    assert exam_pdf1.verified_by.first() == user
    assert exam_pdf2.verified_by.first() == user

    # With two different courses
    docinfo1 = exam_pdf1.document_info
    docinfo2 = exam_pdf2.document_info
    assert docinfo1.course == course2
    assert docinfo2.course == course3

    # But all other attributes being equal
    for key, value in common_docinfo_attrs.items():
        assert getattr(docinfo1, key) == value
        assert getattr(docinfo2, key) == value

    # The two other unverified infos have now been removed
    assert not DocumentInfoSource.objects.filter(
        verified_by__isnull=True,
    ).exists()

    # And we have altogether 3 DocumentInfo objects
    assert DocumentInfo.objects.count() == 3

    # And only two through relations
    assert DocumentInfoSource.objects.count() == 2